#~*~*~*~**~*~**~~*##REPLICATION CODE FOR KINSHIP INTERLOCKS PAPER!!##~*~**~*~*~*~*#############

####~*~**~*~*~*~*#I have been working on this project for nearly a decade, with the result that~*~*~*####
####~*~*~*~* even though I've tried to trim this down, much of the code is inefficient and~*~*~*######
####~**~*~*~*~*some of it is unnecessary for this paper. Feel free to improve and ~*~*~*###########
####~**~*~*~*~*please let me know if you find any terrible mistakes!!!!*~*~*~*~*~*###################

## LOADING LIBRARIES! #do i use all of these? not sure
library(dplyr)
library(ggplot2)
library(openxlsx)
library(igraph)
library(tidyr)
library(tidygraph)
library(ggraph)
library(purrr)
library(tibble)
library(kableExtra)
library(scales) 
library(modelsummary)
library(ggVennDiagram)
library(eulerr)

## Set your working directory to wherever you've put the replication files
## (blue2.xlsx, polelites.xlsx and econelites.xlsx are read by relative path
## below). A bare setwd() call has no `dir` argument and always stops with an
## error, which aborts a sourced run before any data is read, so the call is
## kept as a template: fill in the path and uncomment it.
# setwd("path/to/replication/files")

## Loading the main spreadsheets (blue2.xlsx). Every sheet is read from its
## first row, with that row used as the column names.
ppl_raw      <- read.xlsx("blue2.xlsx", sheet = 1, colNames = TRUE, startRow = 1)
rel_raw      <- read.xlsx("blue2.xlsx", sheet = 2, colNames = TRUE, startRow = 1)
orgs         <- read.xlsx("blue2.xlsx", sheet = 3, colNames = TRUE, startRow = 1)
members_raw  <- read.xlsx("blue2.xlsx", sheet = 4, colNames = TRUE, startRow = 1)
social_years <- read.xlsx("blue2.xlsx", sheet = 5, colNames = TRUE, startRow = 1)

## Loading the political elites workbook for the kinship interlocks part:
## one object per sheet, listed in sheet order.
mayors_raw      <- read.xlsx("polelites.xlsx", sheet = 1, colNames = TRUE, startRow = 1)
citycouncil_raw <- read.xlsx("polelites.xlsx", sheet = 2, colNames = TRUE, startRow = 1)
sheriffs_raw    <- read.xlsx("polelites.xlsx", sheet = 3, colNames = TRUE, startRow = 1)
daljudges_raw   <- read.xlsx("polelites.xlsx", sheet = 4, colNames = TRUE, startRow = 1)
schoolboard_raw <- read.xlsx("polelites.xlsx", sheet = 5, colNames = TRUE, startRow = 1)
dalgovatts_raw  <- read.xlsx("polelites.xlsx", sheet = 6, colNames = TRUE, startRow = 1)
presdalbar_raw  <- read.xlsx("polelites.xlsx", sheet = 7, colNames = TRUE, startRow = 1)
ussentx_raw     <- read.xlsx("polelites.xlsx", sheet = 8, colNames = TRUE, startRow = 1)
txgovs_raw      <- read.xlsx("polelites.xlsx", sheet = 9, colNames = TRUE, startRow = 1)
ltgovs_raw      <- read.xlsx("polelites.xlsx", sheet = 10, colNames = TRUE, startRow = 1)
txsc_raw        <- read.xlsx("polelites.xlsx", sheet = 11, colNames = TRUE, startRow = 1)
pcmayors_raw    <- read.xlsx("polelites.xlsx", sheet = 12, colNames = TRUE, startRow = 1)
uscongdal_raw   <- read.xlsx("polelites.xlsx", sheet = 13, colNames = TRUE, startRow = 1)
txcongdal_raw   <- read.xlsx("polelites.xlsx", sheet = 14, colNames = TRUE, startRow = 1)
rrcom_raw       <- read.xlsx("polelites.xlsx", sheet = 15, colNames = TRUE, startRow = 1)
utboard_raw     <- read.xlsx("polelites.xlsx", sheet = 16, colNames = TRUE, startRow = 1)
utstudpres_raw  <- read.xlsx("polelites.xlsx", sheet = 17, colNames = TRUE, startRow = 1)
txsecstate_raw  <- read.xlsx("polelites.xlsx", sheet = 18, colNames = TRUE, startRow = 1)

## Loading the economic elites workbook, again in sheet order.
cpis            <- read.xlsx("econelites.xlsx", sheet = 1, colNames = TRUE, startRow = 1)
mil1892_02_raw  <- read.xlsx("econelites.xlsx", sheet = 2, colNames = TRUE, startRow = 1)
fortune57_raw   <- read.xlsx("econelites.xlsx", sheet = 3, colNames = TRUE, startRow = 1)
tx100_89_raw    <- read.xlsx("econelites.xlsx", sheet = 4, colNames = TRUE, startRow = 1)
estates_raw     <- read.xlsx("econelites.xlsx", sheet = 5, colNames = TRUE, startRow = 1)
ins28           <- read.xlsx("econelites.xlsx", sheet = 6, colNames = TRUE, startRow = 1)
ins29           <- read.xlsx("econelites.xlsx", sheet = 7, colNames = TRUE, startRow = 1)
ins33           <- read.xlsx("econelites.xlsx", sheet = 8, colNames = TRUE, startRow = 1)
ins30           <- read.xlsx("econelites.xlsx", sheet = 9, colNames = TRUE, startRow = 1)
realest1877     <- read.xlsx("econelites.xlsx", sheet = 10, colNames = TRUE, startRow = 1)
bequests_en_raw <- read.xlsx("econelites.xlsx", sheet = 11, colNames = TRUE, startRow = 1)
bequests_p_raw  <- read.xlsx("econelites.xlsx", sheet = 12, colNames = TRUE, startRow = 1)
linz_raw        <- read.xlsx("econelites.xlsx", sheet = 13, colNames = TRUE, startRow = 1)
land1860_raw    <- read.xlsx("econelites.xlsx", sheet = 14, colNames = TRUE, startRow = 1)
ops_raw         <- read.xlsx("econelites.xlsx", sheet = 15, colNames = TRUE, startRow = 1)

## CPI value for 2023 from the Minneapolis Fed chart; used only to convert
## wealth figures from different years onto one standard scale.
base_cpi <- 915.6


### Force every identifier column to character so ids are always compared
### as strings.
## people
ppl_raw$id <- as.character(ppl_raw$id)
## organization memberships: person id and organization id
members_raw$org_id <- as.character(members_raw$org_id)
members_raw$id <- as.character(members_raw$id)
## relationship edgelist: both endpoints
rel_raw$p2 <- as.character(rel_raw$p2)
rel_raw$p1 <- as.character(rel_raw$p1)

# --------------------------------------------------------------------------------------------------------
## functions to load
  
##looks up the "woman" column of ppl for the given id(s).
##ids that are not found in ppl$id are dropped, so the result can be
##shorter than the input (length 0 when nothing matches)
find_gender <- function(id) {
  hit_rows <- match(id, ppl$id)
  hit_rows <- hit_rows[!is.na(hit_rows)]
  woman_col <- which(colnames(ppl) == "woman")
  ppl[hit_rows, woman_col]
}

##looks up the "religion" column of ppl for the given id(s);
##ids missing from ppl$id are silently skipped
find_religion <- function(id) {
  matched_rows <- na.omit(match(id, ppl$id))
  religion_col <- which(colnames(ppl) == "religion")
  ppl[matched_rows, religion_col]
}

##returns 1 if any row of members has id in its first column and groupid
##in its second column, 0 otherwise (NA if the row count itself is NA)
is_member <- function(id, groupid) {
  same_person <- members[, 1] == id
  same_group <- members[, 2] == groupid
  n_hits <- sum(same_person & same_group)
  ifelse(n_hits == 0, 0, 1)
}

#birth year lookup: value(s) of ppl's "byear" column for the given id(s);
#ids not present in ppl$id are dropped from the result
find_birthyear <- function(id) {
  matched_rows <- na.omit(match(id, ppl$id))
  byear_col <- which(colnames(ppl) == "byear")
  ppl[matched_rows, byear_col]
}

#death year lookup: value(s) of ppl's "dyear" column for the given id(s);
#ids not present in ppl$id are dropped from the result
find_deathyear <- function(id) {
  row_idx <- match(id, ppl$id)
  row_idx <- row_idx[!is.na(row_idx)]
  ppl[row_idx, which(colnames(ppl) == "dyear")]
}

##family cluster lookup: value(s) of ppl's "cluster" column for the
##given id(s); unmatched ids are dropped
find_cluster <- function(id) {
  cluster_col <- which(colnames(ppl) == "cluster")
  person_rows <- na.omit(match(id, ppl$id))
  ppl[person_rows, cluster_col]
}


##compares the family clusters of two people: 1 when find_cluster() gives
##the same value for both ids, 0 when it differs (NA if either cluster is NA)
are_related  <- function(ida, idb) {
  same_cluster <- find_cluster(ida) == find_cluster(idb)
  ifelse(same_cluster, 1, 0)
}

##shortest chain of relatives between two people in the graph bbg.
##returns the string "Not related" when are_related() says the two are in
##different clusters; otherwise the vertex names along the first shortest
##path from ida to idb (both ids are converted to character for the lookup)
rel_chain <- function(ida, idb) {
  if (are_related(ida, idb) == 0) {
    return("Not related")
  }
  paths <- shortest_paths(bbg, from = as.character(ida), to = as.character(idb))
  first_path <- paths$vpath[[1]]
  attr(first_path, "names")
}

##edge type (third column of rel) for the row linking ida and idb, in either
##direction. returns NA when no such row exists; when several rows match,
##only the first edge type is returned
find_edgetype <- function(ida, idb) {
  forward <- rel[, 1] == ida & rel[, 2] == idb
  backward <- rel[, 1] == idb & rel[, 2] == ida
  edge_types <- rel[forward | backward, 3]
  ifelse(length(edge_types) == 0, NA, edge_types)
}


##takes in two connected vertices and spits out the edge types along the
##shortest chain between them (one entry per consecutive pair in rel_chain()).
##returns NA when the chain has no edges, i.e. rel_chain() gave a single
##element such as "Not related" or ida and idb are the same vertex.
##the old loop ran over 1:lenrec, which counts 1, 0 when lenrec is 0 and so
##looked up edges for pairs that do not exist
rel_chain_edgetypes <- function(ida, idb) {
  relchain <- rel_chain(ida, idb)
  n_edges <- length(relchain) - 1 ##length of edgetypes is one fewer
  if (n_edges < 1) {
    return(NA)
  }
  unlist(lapply(seq_len(n_edges), function(i) {
    find_edgetype(relchain[i], relchain[i + 1])
  }))
}


##takes in id and spits out dad if dad is in edgelist
##if dad isn't in there returns NA
##a dad is the first-column id of a "parentchild" row whose second column is
##kid_id and whose fourth column is 0 (presumably the parent's woman flag --
##TODO confirm against the rel sheet)
##rows where the test comes out NA (e.g. missing fourth column) are ignored;
##before, a single such row made sum(index) NA, ifelse() evaluated neither
##assignment and the function stopped with "object 'dad_id' not found"
find_dad <- function(kid_id) {
  is_dad_row <- rel[, 2] == kid_id & rel[, 3] == "parentchild" & rel[, 4] == 0
  dad <- unique(rel[which(is_dad_row), 1])
  if (length(dad) == 0) {
    return(NA)
  }
  ##an id of 0 was used as the "nobody" sentinel; keep mapping it to NA
  dad[which(dad == 0)] <- NA
  dad
}


##takes in id and spits out mom if she is in edgelist
##if mom isn't in there returns NA
##a mom is the first-column id of a "parentchild" row whose second column is
##kid_id and whose fourth column is 1 (presumably the parent's woman flag --
##TODO confirm against the rel sheet)
##rows where the test comes out NA (e.g. missing fourth column) are ignored;
##before, a single such row made sum(index) NA, ifelse() evaluated neither
##assignment and the function stopped with "object 'mom_id' not found"
find_mom <- function(kid_id) {
  is_mom_row <- rel[, 2] == kid_id & rel[, 3] == "parentchild" & rel[, 4] == 1
  mom <- unique(rel[which(is_mom_row), 1])
  if (length(mom) == 0) {
    return(NA)
  }
  ##an id of 0 was used as the "nobody" sentinel; keep mapping it to NA
  mom[which(mom == 0)] <- NA
  mom
}

##parents of id: the unique first-column ids of "parentchild" rows whose
##second column is id. returns NA when there are no such rows (or when the
##row count is NA); a parent id equal to 0 is also reported as NA
find_parents <- function(id) {
  is_parent_row <- rel[, 2] == id & rel[, 3] == "parentchild"
  n_rows <- sum(is_parent_row)
  if (is.na(n_rows) || n_rows == 0) {
    par_id <- NA
  } else {
    par_id <- rel[is_parent_row, 1]
  }
  parents <- unique(par_id)
  ifelse(parents == 0, NA, parents)
}



##siblings of id, full and half alike.
##combines (a) people linked to id by a "sibling", "halfsiblingmat" or
##"halfsiblingpat" row in rel and (b) the other kids of any parent that
##find_parents() reports. returns the unique ids, or NA if none are found
find_sib <- function(id) {
  ##(a) siblings recorded directly in the edgelist
  touches_id <- rel[, 1] == id | rel[, 2] == id
  is_sib_edge <- rel[, 3] == "sibling" | rel[, 3] == "halfsiblingmat" |
    rel[, 3] == "halfsiblingpat"
  edge_rows <- touches_id & is_sib_edge
  n_rows <- sum(edge_rows)
  if (is.na(n_rows) || n_rows == 0) {
    endpoints <- NA
  } else {
    endpoints <- c(rel[edge_rows, 2], rel[edge_rows, 1])
  }
  endpoints <- unique(endpoints)
  direct_sibs <- endpoints[endpoints != id]

  ##(b) siblings inferred through shared parents
  parents <- find_parents(id)
  if (is.na(parents[1])) {
    parent_sibs <- NA
  } else {
    co_kids <- unlist(sapply(parents, FUN = find_kids))
    co_kids <- unique(co_kids[!is.na(co_kids)])
    parent_sibs <- co_kids[co_kids != id]
  }

  ##merge: drop the NA placeholders whenever at least one real id exists
  combined <- c(direct_sibs, parent_sibs)
  if (length(combined) > 1 && any(!is.na(combined))) {
    combined <- combined[!is.na(combined)]
  }
  unique(combined)
}



##takes in id and spits out vector of maternal sibs if they're in there
##and if they aren't returns NA
##maternal sibs are people linked to id by a "sibling" or "halfsiblingmat"
##row in rel, plus the other kids of the mom(s) that find_mom() reports
find_sib_mat <- function(id) {
  #first find ones that are directly listed as siblings
  index <- ((rel[,1] == id | rel[,2] == id) & (rel[,3] == "halfsiblingmat" | rel[,3] == "sibling"))
  sumind <- sum(index)
  if (is.na(sumind) || sumind == 0) {
    sib_id <- NA
  } else {
    sib_id <- c(rel[index, 2], rel[index, 1])
  }
  sibs <- unique(sib_id)
  sibs_d <- sibs[sibs != id]

  #then find ones through mom
  mom <- find_mom(id)
  #test only the first element, as find_sib does for parents: find_mom can
  #return more than one id, and if() stops on a condition of length > 1
  if (!is.na(mom[1])) {
    sib <- unlist(sapply(mom, FUN = find_kids))
    sibs2 <- unique(sib[!is.na(sib)])
    sibs_p <- sibs2[sibs2 != id]
  } else {
    sibs_p <- NA
  }

  ##put them together- remove NAs if there are
  ##ids in there, and return NA if there arent
  a1 <- c(sibs_d, sibs_p)
  if ((length(a1) > 1) && any(!is.na(a1))) {
    answer <- a1[!is.na(a1)]
  } else {
    answer <- a1
  }
  return(unique(answer))
}

##takes in id and spits out vector of paternal sibs if they're in there
##and if they aren't returns NA
##paternal sibs are people linked to id by a "sibling" or "halfsiblingpat"
##row in rel, plus the other kids of the dad(s) that find_dad() reports
find_sib_pat <- function(id) {
  #first find ones that are directly listed as siblings
  index <- ((rel[,1] == id | rel[,2] == id) & (rel[,3] == "halfsiblingpat" | rel[,3] == "sibling"))
  sumind <- sum(index)
  if (is.na(sumind) || sumind == 0) {
    sib_id <- NA
  } else {
    sib_id <- c(rel[index, 2], rel[index, 1])
  }
  sibs <- unique(sib_id)
  sibs_d <- sibs[sibs != id]

  #then find ones through dad
  dad <- find_dad(id)
  #test only the first element, as find_sib does for parents: find_dad can
  #return more than one id, and if() stops on a condition of length > 1
  if (!is.na(dad[1])) {
    sib <- unlist(sapply(dad, FUN = find_kids))
    sibs2 <- unique(sib[!is.na(sib)])
    sibs_p <- sibs2[sibs2 != id]
  } else {
    sibs_p <- NA
  }

  ##put them together- remove NAs if there are
  ##ids in there, and return NA if there arent
  a1 <- c(sibs_d, sibs_p)
  if ((length(a1) > 1) && any(!is.na(a1))) {
    answer <- a1[!is.na(a1)]
  } else {
    answer <- a1
  }
  return(unique(answer))
}



##kids of id: the unique second-column ids of "parentchild" rows whose
##first column is id. returns NA when there are no such rows (or when the
##row count is NA); a kid id equal to 0 is also reported as NA
find_kids <- function(id) {
  is_kid_row <- rel[, 1] == id & rel[, 3] == "parentchild"
  n_rows <- sum(is_kid_row)
  if (is.na(n_rows) || n_rows == 0) {
    kid_id <- NA
  } else {
    kid_id <- rel[is_kid_row, 2]
  }
  kids <- unique(kid_id)
  ifelse(kids == 0, NA, kids)
}



##sons of id: the kids from find_kids() for whom find_gender() equals 0.
##returns NA when id has no kids in the data or none of them qualify
find_sons <- function(id) {
  kids <- find_kids(id)
  if (is.na(kids)[1]) {
    return(NA)
  }
  sons <- kids[find_gender(kids) == 0]
  if (length(sons) == 0) NA else sons
}

##daughters of id: the kids from find_kids() for whom find_gender() equals 1.
##returns NA when id has no kids in the data or none of them qualify
find_daughters <- function(id) {
  kids <- find_kids(id)
  if (is.na(kids)[1]) {
    return(NA)
  }
  daughters <- kids[find_gender(kids) == 1]
  if (length(daughters) == 0) NA else daughters
}



##patrilineal grandsons: the sons of id's sons (find_sons applied twice).
##returns NA when id has no sons or the sons have no sons
find_pgsons <- function(id) {
  sons <- find_sons(id)
  if (is.na(sons)[1]) {
    return(NA)
  }
  grandsons <- unlist(sapply(sons, FUN = find_sons))
  grandsons <- grandsons[!is.na(grandsons)]
  if (length(grandsons) == 0) NA else grandsons
}

##patrilineal descendants as a named list, one element per generation:
##sons, grandsons, greatgrandsons, greatgreatgrandsons, great3grandsons,
##great4grandsons. each generation is find_sons() applied to the previous
##one with NAs removed; sons itself is stored as find_sons() returned it
find_pds <- function(id) {
  later_generations <- c("grandsons", "greatgrandsons", "greatgreatgrandsons",
                         "great3grandsons", "great4grandsons")
  desc <- list()
  current <- find_sons(id)
  desc[["sons"]] <- current
  for (generation in later_generations) {
    found <- unlist(sapply(current, FUN = find_sons))
    current <- found[!is.na(found)]
    ##assigning NULL adds nothing, so a generation can be absent from the list
    desc[[generation]] <- current
  }
  desc
}

##finds patrilineal descendants- but unlisted: one flat vector holding sons,
##grandsons, great-grandsons, great-great-grandsons and
##great-great-great-grandsons (find_sons applied generation by generation).
##returns NA when none are found.
##the NA-cleaned result used to be computed and then thrown away because the
##function returned the raw vector, NA placeholders included
find_pds_ul <- function(id) {
  ##sons of every id in a vector, with NA placeholders removed
  sons_of <- function(ids) {
    found <- unlist(sapply(ids, FUN = find_sons))
    found[!is.na(found)]
  }

  sons <- find_sons(id)
  gks <- sons_of(sons)
  ggks <- sons_of(gks)
  g3ks <- sons_of(ggks)
  g4ks <- sons_of(g3ks)

  a1 <- c(sons, gks, ggks, g3ks, g4ks)
  answer <- a1[!is.na(a1)]
  if (length(answer) == 0) {
    answer <- NA
  }
  return(answer)
}



#grandparents of id, or NA if none are found.
#when find_parents() knows a parent, the answer is the parents of those
#parents. only when no parent is known does it fall back to
#"grandparentgrandchild" rows of rel whose second column is id
find_gparents <- function(id) {
  parents <- find_parents(id)

  if (!is.na(parents)[1]) {
    via_parents <- unlist(sapply(parents, FUN = find_parents))
    via_parents <- via_parents[!is.na(via_parents)]
    if (length(via_parents) == 0) {
      return(NA)
    }
    return(via_parents)
  }

  edge_rows <- rel[, 2] == id & rel[, 3] == "grandparentgrandchild"
  n_rows <- sum(edge_rows)
  if (is.na(n_rows) || n_rows == 0) {
    endpoints <- NA
  } else {
    endpoints <- c(rel[edge_rows, 2], rel[edge_rows, 1])
  }
  endpoints <- unique(endpoints)
  endpoints <- endpoints[endpoints != id]
  ifelse(endpoints == 0, NA, endpoints)
}

#grandkids of id, or NA if none are found.
#when find_kids() knows a kid, the answer is the kids of those kids. only
#when no kid is known does it fall back to "grandparentgrandchild" rows of
#rel whose first column is id
find_gkids <- function(id) {
  kids <- find_kids(id)

  if (!is.na(kids)[1]) {
    via_kids <- unlist(sapply(kids, FUN = find_kids))
    via_kids <- via_kids[!is.na(via_kids)]
    if (length(via_kids) == 0) {
      return(NA)
    }
    return(via_kids)
  }

  edge_rows <- rel[, 1] == id & rel[, 3] == "grandparentgrandchild"
  n_rows <- sum(edge_rows)
  if (is.na(n_rows) || n_rows == 0) {
    endpoints <- NA
  } else {
    endpoints <- c(rel[edge_rows, 2], rel[edge_rows, 1])
  }
  endpoints <- unique(endpoints)
  endpoints <- endpoints[endpoints != id]
  ifelse(endpoints == 0, NA, endpoints)
}


#aunts and uncles of id, or NA if none are found.
#two sources are merged: siblings (find_sib) of every parent that
#find_parents() reports, and the first-column ids of
#"auntuncleniecenephew" rows in rel whose second column is id
find_auntuncles <- function(id) {
  # aunts/uncles reached through the parents' siblings
  via_parents <- c()
  parents <- find_parents(id)
  if (!is.na(parents)[1]) {
    parent_sibs <- unlist(sapply(parents, FUN = find_sib))
    via_parents <- parent_sibs[!is.na(parent_sibs)]
  }

  # aunts/uncles recorded directly, with id as the niece or nephew
  direct_rows <- rel[, 2] == id & rel[, 3] == "auntuncleniecenephew"
  candidates <- c(via_parents, rel[direct_rows, 1])

  # keep each real id once
  aus <- unique(candidates[!is.na(candidates)])
  if (length(aus) == 0) NA else aus
}



#nieces and nephews of id, or NA if none are found.
#two sources are merged: the second-column ids of "auntuncleniecenephew"
#rows in rel whose first column is id, and the kids (find_kids) of every
#sibling that find_sib() reports
find_nns <- function(id) {
  #nieces/nephews recorded directly in the edgelist
  direct_rows <- rel[, 1] == id & rel[, 3] == "auntuncleniecenephew"
  n_rows <- sum(direct_rows)
  if (is.na(n_rows) || n_rows == 0) {
    direct <- NA
  } else {
    direct <- rel[direct_rows, 2]
  }
  direct <- unique(direct)

  #nieces/nephews reached through siblings
  sibs <- find_sib(id)
  if (is.na(sibs)[1]) {
    via_sibs <- NA
  } else {
    sib_kids <- unlist(sapply(sibs, FUN = find_kids))
    via_sibs <- sib_kids[!is.na(sib_kids)]
  }

  #drop the NA placeholders; if nothing real is left the answer is NA
  combined <- c(direct, via_sibs)
  known <- combined[!is.na(combined)]
  if (length(known) == 0) {
    return(unique(NA))
  }
  unique(known)
}

#first cousins of id: the kids (find_kids) of every aunt or uncle that
#find_auntuncles() reports. returns NA when no aunt/uncle is known or none
#of them has kids in the data, so cousins that are not connected through
#an aunt/uncle are not found here
find_cousinsf <- function(id) {
  aus <- find_auntuncles(id)
  if (is.na(aus)[1]) {
    return(NA)
  }
  cousins_found <- unlist(sapply(aus, FUN = find_kids))
  cousins_found <- cousins_found[!is.na(cousins_found)]
  if (length(cousins_found) == 0) NA else cousins_found
}

##takes in id and spits out vector of spouses
##if none in there returns NA
##a spouse is the other endpoint of any "spouse" row in rel that has id in
##its first or second column.
##the old version passed ifelse() a test one element longer than its `no`
##argument (the test still contained id itself) and only gave the right ids
##because ifelse() recycled the shorter vector; id is now removed first
find_spouses <- function(id) {
  is_spouse_row <- (rel[, 1] == id | rel[, 2] == id) & rel[, 3] == "spouse"
  n_rows <- sum(is_spouse_row)
  if (is.na(n_rows) || n_rows == 0) {
    return(NA)
  }
  partners <- unique(c(rel[is_spouse_row, 2], rel[is_spouse_row, 1]))
  partners <- partners[partners != id]
  ##an id of 0 is treated as "nobody", as before
  partners[which(partners == 0)] <- NA
  if (length(partners) == 0) {
    return(NA)
  }
  unique(partners)
}

#industry that dad works in: the "indp1" column of ppl2 for the dad that
#find_dad() reports, as character. returns NA when no dad is known.
#uses ppl2 which is in the newmoney file
find_dadjob <- function(id) {
  dad <- find_dad(id)
  if (is.na(dad)) {
    return(NA)
  }
  dad_rows <- na.omit(match(dad, ppl2$id))
  industry_col <- which(colnames(ppl2) == "indp1")
  as.character(ppl2[dad_rows, industry_col])
}



##FOR WOMEN ONLY bc of name structure
##first husband of id, if in dataset, else NA.
##with a single known spouse that spouse is returned. with several, the
##spouse whose last name (find_lname) equals id's "last_penult" column in
##ppl is picked; only the first such spouse is returned
find_fhusband <- function(id) {
  spouses <- find_spouses(id)
  if (is.na(spouses)[1]) {
    return(NA)
  }
  if (length(spouses) == 1) {
    return(spouses)
  }
  own_row <- na.omit(match(id, ppl$id))
  penult <- ppl[own_row, which(colnames(ppl) == "last_penult")]
  spouse_lasts <- unlist(sapply(spouses, FUN = find_lname))
  name_matches <- spouses[which(spouse_lasts == penult)]
  ifelse(length(name_matches) == 0, NA, name_matches)
}

##first husband's job: the "indp1" column of ppl2 for the husband that
##find_fhusband() reports, as character. returns NA when there is none
find_hubjob <- function(id) {
  hub <- find_fhusband(id)
  if (is.na(hub)) {
    return(NA)
  }
  hub_rows <- na.omit(match(hub, ppl2$id))
  industry_col <- which(colnames(ppl2) == "indp1")
  as.character(ppl2[hub_rows, industry_col])
}

##all descendants of id, including the collateral line through siblings.
##direct line: kids, grandkids (find_gkids) and two further generations of
##their kids. collateral line: nieces/nephews (find_nns) and three further
##generations of their kids. returns one flat vector (duplicates are kept),
##or NA when nobody is found
find_all_desc <- function(id) {
  ##kids of every id in a vector, with NA placeholders removed
  kids_of <- function(ids) {
    found <- unlist(sapply(ids, FUN = find_kids))
    found[!is.na(found)]
  }

  kids <- find_kids(id)
  gkids <- find_gkids(id)
  ggks <- kids_of(gkids)
  g3s <- kids_of(ggks)

  nns <- find_nns(id)
  gnns <- kids_of(nns)
  ggnns <- kids_of(gnns)
  g3nns <- kids_of(ggnns)

  everyone <- c(kids, gkids, ggks, g3s, nns, gnns, ggnns, g3nns)
  known <- everyone[!is.na(everyone)]
  if (length(known) == 0) NA else known
}

#number of distinct people from find_all_desc(id) (collateral lines
#included) whose id appears in sr$id
num_d <- function(id) {
  desc <- unlist(find_all_desc(id))
  in_sr <- desc[desc %in% sr$id]
  length(unique(in_sr))
}


##value of the "srsum" column of pplsr for the given id(s) -- the total
##number of social registers someone is in. needs pplsr to exist already;
##ids not found in pplsr$id are dropped from the result
find_srsum <- function(id) {
  person_rows <- na.omit(match(id, pplsr$id))
  srsum_col <- which(colnames(pplsr) == "srsum")
  pplsr[person_rows, srsum_col]
}


#this function finds the first year a person was in the social registers.
#it returns NA when find_srsum() is 0, NA or empty (person not in pplsr).
#otherwise it takes the person's row of pplsr without the srsum column,
#finds the first column holding a 1 and looks that column's name up in
#social_years$column, returning the matching value from social_years'
#second column.
#the column name is read from the same srsum-less frame the position was
#found in; reading it from colnames(pplsr) picked the wrong column whenever
#srsum sits before the register columns
find_minsr <- function(id) {
  srsum <- find_srsum(id)
  if (length(srsum) == 0 || is.na(srsum[1]) || srsum[1] == 0) {
    return(NA)
  }
  person_row <- pplsr[na.omit(match(id, pplsr$id)), ]
  sr_flags <- person_row[, colnames(person_row) != "srsum", drop = FALSE]
  mincolname <- colnames(sr_flags)[min(which(sr_flags == 1))]
  social_years[match(mincolname, social_years$column), 2]
}

#this function finds the last year a person was in the social registers.
#it returns NA when find_srsum() is 0, NA or empty (person not in pplsr).
#otherwise it takes the person's row of pplsr without the srsum column,
#finds the last column holding a 1 and looks that column's name up in
#social_years$column, returning the matching value from social_years'
#second column.
#the column name is read from the same srsum-less frame the position was
#found in; reading it from colnames(pplsr) picked the wrong column whenever
#srsum sits before the register columns
find_maxsr <- function(id) {
  srsum <- find_srsum(id)
  if (length(srsum) == 0 || is.na(srsum[1]) || srsum[1] == 0) {
    return(NA)
  }
  person_row <- pplsr[na.omit(match(id, pplsr$id)), ]
  sr_flags <- person_row[, colnames(person_row) != "srsum", drop = FALSE]
  maxcolname <- colnames(sr_flags)[max(which(sr_flags == 1))]
  social_years[match(maxcolname, social_years$column), 2]
}



#tells you whether a person is alive in a given year: 1 if yes, 0 if no.
#alive means born on or before the year and either died on or after it, or
#has no death year but was born after 1930. NA handling is rough: someone
#with no death year born in 1930 or earlier comes out as NA
is_alive <- function(id, year) {
  person_rows <- na.omit(match(id, ppl$id))
  birth <- ppl[person_rows, which(colnames(ppl) == "byear")]
  death <- ppl[person_rows, which(colnames(ppl) == "dyear")]

  born_by_then <- birth <= year
  died_later <- death >= year
  presumed_living <- is.na(death) & birth > 1930

  ifelse((born_by_then & died_later) | (born_by_then & presumed_living), 1, 0)
}


##builds sibling rows for the rel file: one row per sibling found in the
##"sibs" column of sibss for this id, with id in p1, the sibling in p2 and
##edgetype "sibling"
add_sibs <- function(id) {
  sib_col <- which(colnames(sibss) == "sibs")
  sib_ids <- unlist(sibss[na.omit(match(id, sibss$id)), sib_col])
  edges <- expand.grid(id, sib_ids)
  names(edges) <- c("p1", "p2")
  edges$edgetype <- "sibling"
  edges
}

##builds grandparent rows for the rel file: one row per grandparent found in
##the "gparents" column of gpar for this id, with the grandparent in p1, id
##in p2 and edgetype "grandparentgrandchild"
add_gparents <- function(id) {
  gp_col <- which(colnames(gpar) == "gparents")
  gp_ids <- unlist(gpar[na.omit(match(id, gpar$id)), gp_col])
  edges <- expand.grid(gp_ids, id)
  names(edges) <- c("p1", "p2")
  edges$edgetype <- "grandparentgrandchild"
  edges
}

#builds aunt/uncle rows for the rel file: one row per aunt or uncle found in
#the "auntuncles" column of auntunc for this id, with the aunt/uncle in p1,
#id in p2 and edgetype "auntuncleniecenephew"
add_auntuncles <- function(id) {
  au_col <- which(colnames(auntunc) == "auntuncles")
  au_ids <- unlist(auntunc[na.omit(match(id, auntunc$id)), au_col])
  edges <- expand.grid(au_ids, id)
  names(edges) <- c("p1", "p2")
  edges$edgetype <- "auntuncleniecenephew"
  edges
}

##builds first-cousin rows for the rel file: one row per cousin found in the
##"fcousins" column of cousins for this id, with the cousin in p1, id in p2
##and edgetype "cousinsfirst"
add_cousins <- function(id) {
  cousin_col <- which(colnames(cousins) == "fcousins")
  cousin_ids <- unlist(cousins[na.omit(match(id, cousins$id)), cousin_col])
  edges <- expand.grid(cousin_ids, id)
  names(edges) <- c("p1", "p2")
  edges$edgetype <- "cousinsfirst"
  edges
}



##generic lookup: value(s) of the ppl column named colname for the given
##id(s); ids not found in ppl$id are dropped from the result
find_col <- function(id, colname) {
  person_rows <- na.omit(match(id, ppl$id))
  wanted_col <- which(colnames(ppl) == colname)
  ppl[person_rows, wanted_col]
}


##first name lookup: the "first" column of ppl for the given id(s);
##unmatched ids are dropped
find_fname <- function(id) {
  person_rows <- match(id, ppl$id)
  person_rows <- person_rows[!is.na(person_rows)]
  ppl[person_rows, which(colnames(ppl) == "first")]
}

##last name lookup: the "last" column of ppl for the given id(s);
##unmatched ids are dropped
find_lname <- function(id) {
  last_col <- which(colnames(ppl) == "last")
  person_rows <- na.omit(match(id, ppl$id))
  ppl[person_rows, last_col]
}

##maiden name lookup: the "maiden" column of ppl for the given id(s);
##unmatched ids are dropped
find_mname <- function(id) {
  person_rows <- na.omit(match(id, ppl$id))
  maiden_col <- which(colnames(ppl) == "maiden")
  ppl[person_rows, maiden_col]
}

##middle name lookup: the "middle" column of ppl for the given id(s);
##unmatched ids are dropped
find_midname <- function(id) {
  person_rows <- match(id, ppl$id)
  person_rows <- person_rows[!is.na(person_rows)]
  middle_col <- which(colnames(ppl) == "middle")
  ppl[person_rows, middle_col]
}


#arrival year lookup: the "arrive_dallas" column of ppl for the given
#id(s); unmatched ids are dropped
find_arrival <- function(id) {
  arrival_col <- which(colnames(ppl) == "arrive_dallas")
  person_rows <- na.omit(match(id, ppl$id))
  ppl[person_rows, arrival_col]
}

#coreness lookup: the "kcore" column of ppl for the given id(s);
#unmatched ids are dropped
find_core <- function(id) {
  person_rows <- na.omit(match(id, ppl$id))
  ppl[person_rows, which(colnames(ppl) == "kcore")]
}

##TRUE when a person's birthtown or birthcounty in ppl is "Dallas" and their
##birthstate is "TX", FALSE otherwise (NA when the comparison is NA).
##ids not found in ppl$id are dropped, so the result can be shorter than id
born_dallas <- function(id) {
  person_rows <- na.omit(match(id, ppl$id))
  btown <- ppl[person_rows, which(colnames(ppl) == "birthtown")]
  bcounty <- ppl[person_rows, which(colnames(ppl) == "birthcounty")]
  birthstate <- ppl[person_rows, which(colnames(ppl) == "birthstate")]

  in_dallas <- btown == "Dallas" | bcounty == "Dallas"
  in_texas <- birthstate == "TX"
  ifelse(in_dallas & in_texas, TRUE, FALSE)
}


#finds minimum dallas byear of sibs. if there is none returns NA
#i.e. the earliest birth year (find_birthyear) among the siblings
#(find_sib) for whom born_dallas() is TRUE.
#NA is also returned when none of those siblings has a known birth year;
#min() used to return Inf with a warning in that case.
#the dallas-born siblings are picked by plain subsetting; the old ifelse()
#got a `no` argument shorter than its test and relied on recycling
sibs_mindalby <- function(id) {
  sibs <- find_sib(id)
  dallas_sibs <- sibs[which(born_dallas(sibs))]
  dallas_sibs <- dallas_sibs[!is.na(dallas_sibs)]
  if (length(dallas_sibs) == 0) {
    return(NA)
  }
  byears <- find_birthyear(dallas_sibs)
  if (all(is.na(byears))) {
    return(NA)
  }
  min(byears, na.rm = TRUE)
}

#finds minimum dallas byear of kids. if there is none returns NA
#i.e. the earliest birth year (find_birthyear) among the kids (find_kids)
#for whom born_dallas() is TRUE.
#NA is also returned when none of those kids has a known birth year;
#min() used to return Inf with a warning in that case.
#the dallas-born kids are picked by plain subsetting; the old ifelse()
#got a `no` argument shorter than its test and relied on recycling
kids_mindalby <- function(id) {
  kids <- find_kids(id)
  dallas_kids <- kids[which(born_dallas(kids))]
  dallas_kids <- dallas_kids[!is.na(dallas_kids)]
  if (length(dallas_kids) == 0) {
    return(NA)
  }
  byears <- find_birthyear(dallas_kids)
  if (all(is.na(byears))) {
    return(NA)
  }
  min(byears, na.rm = TRUE)
}



#descendants of id as a named list, one element per generation: kids,
#grandkids, greatgrandkids, greatgreatgrandkids, great3grandkids,
#great4grandkids -- nothing deeper is searched. each generation is
#find_kids() applied to the previous one with NAs removed; kids itself is
#stored as find_kids() returned it
find_descendants <- function(id) {
  later_generations <- c("grandkids", "greatgrandkids", "greatgreatgrandkids",
                         "great3grandkids", "great4grandkids")
  desc <- list()
  current <- find_kids(id)
  desc[["kids"]] <- current
  for (generation in later_generations) {
    found <- unlist(sapply(current, FUN = find_kids))
    current <- found[!is.na(found)]
    #assigning NULL adds nothing, so a generation can be absent from the list
    desc[[generation]] <- current
  }
  desc
}



#number of entries from find_descendants(id), flattened across all
#generations, whose id appears in sr$id (repeats are counted each time)
find_nd_sr <- function(id) {
  desc <- unlist(find_descendants(id))
  sum(desc %in% sr$id)
}

#ancestors of id as a named list, one element per generation: parents,
#grandparents, greatgrandparents, greatgreatgrandparents,
#greatgreatgreatgrandparents. each generation is find_parents() applied to
#the previous one with NAs removed; parents itself is stored as
#find_parents() returned it
find_ancestors <- function(id) {
  earlier_generations <- c("grandparents", "greatgrandparents",
                           "greatgreatgrandparents",
                           "greatgreatgreatgrandparents")
  anc <- list()
  current <- find_parents(id)
  anc[["parents"]] <- current
  for (generation in earlier_generations) {
    found <- unlist(sapply(current, FUN = find_parents))
    current <- found[!is.na(found)]
    #assigning NULL adds nothing, so a generation can be absent from the list
    anc[[generation]] <- current
  }
  anc
}



#number of entries from find_ancestors(id), flattened across all
#generations, whose id appears in sr$id (repeats are counted each time)
find_na_sr <- function(id) {
  anc <- unlist(find_ancestors(id))
  sum(anc %in% sr$id)
}

#generates list of direct ties for a person
#looks up the position of id among the vertex names of the graph bbg and
#returns the edges of bbg selected by from() for that vertex.
#NOTE(review): presumably bbg is the kinship graph built from rel, and for
#an undirected graph this gives every edge touching the person -- confirm
#where bbg is built
find_rels <- function(id) {
  #position of the person in the vertex list (NA if id is not a vertex name)
  idx <- match(id, V(bbg)$name)
  #from() is only meaningful inside the E()[...] index
  E(bbg)[from(idx)]
}

#finds out how many sr elders someone has in dataset: lineal ancestors in sr
#(find_na_sr), plus aunts/uncles in sr (find_auntuncles), plus people in sr
#who appear in the first column of a "greatauntunclegreatniecenephew" or
#"grandparentgrandchild" row of rel whose second column is id.
#the third group used to go through a scalar ifelse(), which kept only the
#first matching person, so it could never contribute more than 1.
#find_auntuncles() is now called once instead of twice, but this is still
#slow- only use for individuals
#NOTE(review): a grandparent reachable through parent links and also listed
#in a grandparentgrandchild row is counted in both the first and the third
#group -- confirm whether that double count is wanted
num_sr_elders <- function(id) {
  num_lineals <- find_na_sr(id)

  auntunc <- find_auntuncles(id)
  auntunc <- auntunc[!is.na(auntunc)]
  num_auntunc <- sum(auntunc %in% sr$id)

  index <- (rel[,2] == id & rel[,3] == "greatauntunclegreatniecenephew" |
              rel[,2] == id & rel[,3] == "grandparentgrandchild")
  #which() drops rows where the test is NA instead of zeroing the count
  others <- unique(rel[which(index), 1])
  others <- others[!is.na(others)]
  num_others <- sum(others %in% sr$id)

  total <- num_lineals + num_auntunc + num_others
  return(total)
}



#number of distinct sons reported by find_sons(); 0 when the first entry
#of that result is NA
num_sons <- function(id) {
  sons <- unlist(find_sons(id))
  if (is.na(sons[1])) 0 else length(unique(sons))
}

#number of daughters
#counts the distinct ids returned by find_daughters; a leading NA is treated
#as "no daughters" and gives 0
num_dau <- function(id) {
  found <- unlist(find_daughters(id))
  none_found <- is.na(found[1])
  if (none_found) 0 else length(unique(found))
}

##number of nieces and nephews (by blood not marriage)
##counts the distinct ids returned by find_nns; a leading NA gives 0
num_nns <- function(id) {
  nns <- unlist(find_nns(id))
  if (!is.na(nns[1])) {
    return(length(unique(nns)))
  }
  0
}

# NOTE(review): looks like a leftover scratch value for trying the helper
# functions interactively; it creates a global `id` -- confirm nothing below
# relies on it before removing.
id <- 98852

##function to find sisters
##combines two sources: (1) people tied to `id` by a "sibling" row in rel whose
##find_gender value is 1, and (2) daughters of `id`'s parents (via
##find_daughters), excluding `id`. returns the unique ids, or NA if none.
find_sis <- function(id) {
  #source 1: rows of rel that list id as a sibling (either column)
  sib_rows <- (rel[, 1] == id | rel[, 2] == id) & rel[, 3] == "sibling"
  n_rows <- sum(sib_rows)
  if (is.na(n_rows) || n_rows == 0) {
    listed <- NA
  } else {
    listed <- c(rel[sib_rows, 2], rel[sib_rows, 1])
  }
  listed <- unique(listed)
  listed <- listed[!listed == id]
  listed_w <- listed[find_gender(listed) == 1] #keeps only the sisters
  direct <- if (length(listed_w) == 0) NA else listed_w
  
  #source 2: daughters of the same parents
  parents <- find_parents(id)
  if (is.na(parents[1])) {
    via_parents <- NA
  } else {
    daughters <- unlist(sapply(parents, FUN = find_daughters))
    daughters <- unique(daughters[!is.na(daughters)])
    via_parents <- daughters[daughters != id]
  }
  
  #merge: drop the NAs only when at least one real id is present
  combined <- c(direct, via_parents)
  if (length(combined) > 1 && any(!is.na(combined))) {
    combined <- combined[!is.na(combined)]
  }
  unique(combined)
}

##function to find brothers
##combines two sources: (1) people tied to `id` by a "sibling" row in rel whose
##find_gender value is 0, and (2) sons of `id`'s parents (via find_sons),
##excluding `id`. returns the unique ids, or NA if none.
find_bros <- function(id) {
  #source 1: rows of rel that list id as a sibling (either column)
  sib_rows <- (rel[, 1] == id | rel[, 2] == id) & rel[, 3] == "sibling"
  n_rows <- sum(sib_rows)
  if (is.na(n_rows) || n_rows == 0) {
    listed <- NA
  } else {
    listed <- c(rel[sib_rows, 2], rel[sib_rows, 1])
  }
  listed <- unique(listed)
  listed <- listed[!listed == id]
  listed_m <- listed[find_gender(listed) == 0] #keeps only the brothers
  direct <- if (length(listed_m) == 0) NA else listed_m
  
  #source 2: sons of the same parents
  parents <- find_parents(id)
  if (is.na(parents[1])) {
    via_parents <- NA
  } else {
    sons <- unlist(sapply(parents, FUN = find_sons))
    sons <- unique(sons[!is.na(sons)])
    via_parents <- sons[sons != id]
  }
  
  #merge: drop the NAs only when at least one real id is present
  combined <- c(direct, via_parents)
  if (length(combined) > 1 && any(!is.na(combined))) {
    combined <- combined[!is.na(combined)]
  }
  unique(combined)
}

##count sisters
##number of distinct ids from find_sis; a leading NA gives 0
num_sis <- function(id) {
  sisters <- unlist(find_sis(id))
  if (!is.na(sisters[1])) {
    return(length(unique(sisters)))
  }
  0
}

##count brothers
##number of distinct ids from find_bros; a leading NA gives 0
num_bro <- function(id) {
  brothers <- unlist(find_bros(id))
  no_brothers <- is.na(brothers[1])
  if (no_brothers) 0 else length(unique(brothers))
}

##function to find org name
##looks up each org id in orgs$org_id and returns the matching entries of the
##"name" column; ids with no match are dropped
find_orgname <- function(orgid) {
  row_hits <- na.omit(match(orgid, orgs$org_id))
  name_col <- which(colnames(orgs) == "name")
  orgs[row_hits, name_col]
}

##function that says name byear dyear of person
##returns "first middle maiden last (byear-dyear)"; stops with an error when
##`id` is not found in ppl$id
whois <- function(id) {
  stopifnot("`id` is not in dataset." = (id %in% ppl$id))
  
  first <- find_fname(id)
  middle <- find_midname(id)
  maiden <- find_mname(id)
  last_and_years <- paste0(find_lname(id), " (", find_birthyear(id), "-",
                           find_deathyear(id), ")")
  
  paste(first, middle, maiden, last_and_years)
}



##function to explain in text the relationship between two ppl
##when their kin distance is 1
##
##p1, p2: person ids. prints a sentence of the form
##  "<p2 first> <p2 last> (<p2>) was <p1 first> <p1 last> (<p1>)'s <relation>."
##and returns it (invisibly, via print). the general edge type from
##rel_chain_edgetypes is refined using p2's gender (find_gender: 1 = woman,
##0 = man) and which of the two was born first. when birth years are missing
##or equal, or the edge type is not one handled below, the general edge type
##is used as-is.
##
##stops with an error if the two are unrelated or more than one tie apart.
##
##cleanup vs the earlier version: the unused death-year and p1-gender lookups
##were removed and the repeated comparisons are computed once.
kintie <- function(p1, p2) {
  ##these will stop the function if they weren't related or if they were more than
  ##one kin tie apart
  stopifnot("They have no known kin relation." = are_related(p1, p2)==1)
  stopifnot("They are more than one kin tie apart." = length(rel_chain(p1, p2)) == 2)
  
  ##everything needed to categorize the tie
  gentype <- rel_chain_edgetypes(p1, p2)
  p1_by <- find_birthyear(p1)
  p2_by <- find_birthyear(p2)
  p2_g <- find_gender(p2)
  
  p2_older <- p1_by > p2_by
  p2_younger <- p1_by < p2_by
  p2_woman <- p2_g == 1
  p2_man <- p2_g == 0
  
  ##slotting general types into finer categories based on byears and genders
  type2 <- case_when(
    p2_older & p2_woman & gentype == "parentchild" ~ "mom",
    p2_older & p2_man & gentype == "parentchild" ~ "dad",
    p2_younger & p2_man & gentype == "parentchild" ~ "son",
    p2_younger & p2_woman & gentype == "parentchild" ~ "daughter",
    p2_older & p2_woman & gentype == "sibling" ~ "older sister",
    p2_older & p2_man & gentype == "sibling" ~ "older brother",
    p2_younger & p2_man & gentype == "sibling" ~ "younger brother",
    p2_younger & p2_woman & gentype == "sibling" ~ "younger sister",
    p2_older & p2_woman & gentype == "grandparentgrandchild" ~ "grandma",
    p2_older & p2_man & gentype == "grandparentgrandchild" ~ "grandpa",
    p2_younger & p2_man & gentype == "grandparentgrandchild" ~ "grandson",
    p2_younger & p2_woman & gentype == "grandparentgrandchild" ~ "granddaughter",
    p2_older & p2_woman & gentype == "auntuncleniecenephew" ~ "aunt",
    p2_older & p2_man & gentype == "auntuncleniecenephew" ~ "uncle",
    p2_younger & p2_man & gentype == "auntuncleniecenephew" ~ "nephew",
    p2_younger & p2_woman & gentype == "auntuncleniecenephew" ~ "niece",
    p2_woman & gentype == "spouse" ~ "wife",
    p2_man & gentype == "spouse" ~ "husband",
    TRUE ~ gentype)
  
  ##printing the answer in easy to read format
  answer <- paste0(find_fname(p2), " ", find_lname(p2), " (", p2, ") ",
                   "was ", find_fname(p1), " ", find_lname(p1), " (", p1, ")'s ",
                   type2, ".")
  print(answer)
}

##function to get immediate kin ties of a person
##plots the ego graph of `id` in bbg2, out to `order` steps
kingraph <- function(id, order) {
  ego_net <- make_ego_graph(bbg2, order = order, nodes = as.character(id))[[1]]
  
  plot(ego_net,
       main = "Ego Graph",
       main.family = "Arial",
       layout = layout_nicely(ego_net),
       # vertices
       vertex.shape = "circle",
       vertex.size = 15,
       vertex.label.font = 1,
       vertex.label.color = "black",
       vertex.label.family = "Helvetica",
       vertex.label.cex = .75,
       # edges
       edge.color = 'gray9',
       edge.arrow.size = .1,
       edge.label.font = 1,
       edge.label.color = "darkgreen",
       edge.label.family = "Helvetica",
       edge.label.cex = .5)
}

##summarises kin distances among the people in one group
##(reads the global distance matrix kin_dists and the isolates frame isos)
##
##idvec: vector of person ids
##returns a named vector:
##  num_ppl            number of ids supplied
##  num_pairs          number of unordered pairs, n*(n-1)/2
##  mean_kind          mean finite kin distance over connected pairs
##  perc_discon_pairs  percent of pairs with no kin path between them
##  perc_iso_ppl       percent of the group that are isolates
##
##fixes vs the earlier version: disconnected pairs were counted over unordered
##pairs (lower triangle) but divided by n^2, and pairs of two isolates were
##not counted at all. numerator and denominator now both use unordered pairs.
##with fewer than two ids there are no pairs and the percentage is NaN.
kindist_within <- function(idvec) {
  veclength <- length(idvec)
  posspairs <- choose(veclength, 2)
  
  ##split the group into isolates and non-isolates
  vec2 <- as.character(idvec[!idvec %in% isos$id])
  num_noniso <- length(vec2)
  num_iso <- veclength - num_noniso
  perc_iso <- num_iso / veclength
  
  ##every pair involving an isolate is disconnected:
  ##isolate-with-non-isolate pairs plus isolate-with-isolate pairs
  iso_pairs <- num_iso * num_noniso + choose(num_iso, 2)
  
  ##distances among the non-isolates, each unordered pair once
  ##(drop = FALSE keeps a matrix even when only one person is left)
  kd_1a <- kin_dists[vec2, vec2, drop = FALSE]
  kd_1 <- kd_1a[lower.tri(kd_1a)]
  connected <- kd_1 != Inf
  num_infs <- sum(!connected)
  meankd <- mean(kd_1[connected])
  
  ##percent of pairs that are unconnected
  perc_uc <- (num_infs + iso_pairs) / posspairs
  
  answer <- c(num_ppl = as.integer(veclength),
              num_pairs = as.integer(posspairs),
              mean_kind = round(meankd, 2), 
              perc_discon_pairs = round(perc_uc*100, 2), 
              perc_iso_ppl = round(perc_iso*100, 2))
  return(answer)
}

##kin distances within a group, returned as a plain vector: one value per
##unordered pair of non-isolates (lower triangle of the kin_dists subset, so
##the zero diagonal is excluded), with infinite distances removed
kindist_within_vec <- function(idvec) {
  keep <- !(idvec %in% isos$id)
  ids <- as.character(idvec[keep])
  
  sub_dists <- kin_dists[ids, ids]
  below_diag <- sub_dists[lower.tri(sub_dists)]
  
  below_diag[below_diag != Inf]
}
##with infinities removed
##this will calculate the kin distance between two subgroups
##(reads the global distance matrix kin_dists and the isolates frame isos)
##
##idvec1, idvec2: vectors of person ids
##returns a named vector:
##  num_ppl_1, num_ppl_2   sizes of the two groups
##  num_pairs              number of cross-group pairs (n1 * n2)
##  mean_kind              mean finite kin distance over connected pairs
##  perc_discon_pairs      percent of cross-group pairs with no kin path
##  perc_iso_ppl_1/_2      percent of each group that are isolates
##
##fix vs the earlier version: pairs in which BOTH people are isolates were
##counted twice (once per group), which could push the percentage above 100.
##they are now counted once.
kindist_btwn <- function(idvec1, idvec2) {
  veclength1 <- length(idvec1)
  veclength2 <- length(idvec2)
  posspairs <- veclength1*veclength2
  
  ##non-isolates in each group
  noisos_1 <- as.character(idvec1[!idvec1 %in% isos$id])
  noisos_2 <- as.character(idvec2[!idvec2 %in% isos$id])
  
  #count number of isolates in each
  numisos_1 <- veclength1-length(noisos_1)
  numisos_2 <- veclength2-length(noisos_2)
  
  ##calculates percent of isolates in each subgroup of people
  perc_iso_1 <- numisos_1/veclength1
  perc_iso_2 <- numisos_2/veclength2
  
  ##pairs with at least one isolate: (iso in 1) + (iso in 2) - (iso in both)
  iso_pairs <- (numisos_1*veclength2) + (numisos_2*veclength1) -
    (numisos_1*numisos_2)
  
  ##distances between the non-isolates of the two groups
  ##(drop = FALSE keeps a matrix even when a group has a single non-isolate)
  kd_1 <- kin_dists[noisos_1, noisos_2, drop = FALSE]
  connected <- kd_1 != Inf
  num_infs <- sum(!connected)
  meankd <- mean(kd_1[connected])
  
  ##percent of pairs that are unconnected
  perc_uc <- (num_infs+iso_pairs)/(posspairs)
  
  answer <- c(num_ppl_1 = as.integer(veclength1),
              num_ppl_2 = as.integer(veclength2),
              num_pairs = as.integer(posspairs),
              mean_kind = round(meankd, 2), 
              perc_discon_pairs = round(perc_uc*100, 2), 
              perc_iso_ppl_1 = round(perc_iso_1*100, 2),
              perc_iso_ppl_2 = round(perc_iso_2*100, 2))
  return(answer)
}
##kin distances between two subgroups as a plain vector: every pairing of a
##non-isolate from idvec1 with a non-isolate from idvec2, infinite distances
##removed
kindist_btwn_vec <- function(idvec1, idvec2) {
  ids_1 <- as.character(idvec1[!(idvec1 %in% isos$id)])
  ids_2 <- as.character(idvec2[!(idvec2 %in% isos$id)])
  
  cross_dists <- kin_dists[ids_1, ids_2]
  cross_dists[cross_dists != Inf]
}

##function to get immediate kin ties of a person
##with kinterlock colors -- this one is really nice for case studies and quick visualizations per-person
##plots the ego graph of `id` in bbg2b, out to `order` steps
klg <- function(id, order) {
  ego_net <- make_ego_graph(bbg2b, order = order, nodes = as.character(id))[[1]]
  
  plot(ego_net,
       main = "Kinship Interlock Ego Graph",
       main.family = "Arial",
       layout = layout_nicely(ego_net),
       # vertices
       vertex.shape = "circle",
       vertex.size = 15,
       vertex.label.font = 1,
       vertex.label.color = "black",
       vertex.label.family = "Helvetica",
       vertex.label.cex = .75,
       # edges
       edge.color = 'gray9',
       edge.arrow.size = .1,
       edge.label.font = 1,
       edge.label.color = "darkgreen",
       edge.label.family = "Helvetica",
       edge.label.cex = .5)
}


##min kin distance by year -- meant to be called from inside klyr()
##
##targetvec:   ids of the target group (e.g. one elite type in a given year)
##popvec:      ids of everyone to compute a distance for
##isosvec:     ids of that year's isolates (removed from both vectors)
##kin_distsyr: that year's distance matrix, with ids as row/column names
##
##returns a tibble with one row per non-isolate in popvec:
##  id      person id (character)
##  minimp  minimum distance to any target; 100 when no target is reachable
##
##fixes vs the earlier version:
## * with exactly one non-isolate target (or one non-isolate in popvec) the
##   matrix subset collapsed to a plain vector, so the transposed frame lost
##   its ids and every later join on id silently missed. drop = FALSE keeps
##   it a matrix.
## * NA and duplicated ids are dropped before indexing the matrix.
## * with no non-isolate targets at all, everyone gets 100 instead of an error.
min_kd_dfyr <- function(targetvec, popvec, isosvec, kin_distsyr) {
  ##remove isolates, missing ids and repeats
  noisos_1 <- unique(as.character(
    targetvec[!is.na(targetvec) & !targetvec %in% isosvec]))
  noisos_2 <- unique(as.character(
    popvec[!is.na(popvec) & !popvec %in% isosvec]))
  
  ##nothing to measure against (or nobody to measure)
  if (length(noisos_1) == 0 || length(noisos_2) == 0) {
    return(tibble(id = noisos_2, minimp = rep(100, length(noisos_2))))
  }
  
  ##rows = targets, cols = population; always a matrix
  kd_1 <- kin_distsyr[noisos_1, noisos_2, drop = FALSE]
  
  #transposing matrix and adding id column (one row per population person)
  kd_2 <- kd_1 %>%
    t() %>%
    as.data.frame() %>%
    mutate(id = rownames(.)) 
  
  # Reshaping the data to long format and taking the minimum per person
  answer <- kd_2 %>%
    pivot_longer(
      cols = -id,
      names_to = "ColumnName",
      values_to = "Value") %>%
    group_by(id) %>%
    summarize(
      MinValue = min(Value)
    ) %>%
    mutate(minimp = ifelse((MinValue==Inf | is.na(MinValue)), 100, MinValue)) %>%
    select(id, minimp)
  
  return(answer)
}

###making a function that makes klockgraph for a year
##builds the kin network from the r3 ties that started in `year` or earlier
##(edge types limited to rellist; the end-year filter is switched off), then
##returns one row per person in ppl whose byear..dyear span covers `year`:
##  isoyear                  1 if the person has no tie in that year's network
##  econelite/polelite/socelite  1 if the person is in that elite list by `year`
##  pol_kd/soc_kd/econ_kd    min kin distance to each elite type (100 = none)
##  kinlock                  1 if all three distances are below 2
##  elitegroup               label for the combination of elite flags
klyr <- function(year) {
  
  netyr <- year
  
  #ties that exist by this year
  relyr <- r3 %>%
    na.omit() %>%
    filter(start <= netyr, et %in% rellist) %>%
    dplyr::select(p1, p2)
  
  #that year's graph and its distance matrix
  gyr <- graph_from_edgelist(as.matrix(relyr), directed = FALSE)
  kin_distsyr <- distances(gyr)
  
  #people with no tie in this year's edge list
  isosyr <- ppl$id[!(ppl$id %in% relyr$p1 | ppl$id %in% relyr$p2)]
  
  #elites of each type as of this year
  socyr <- filter(socials_byyear, start <= netyr)$id
  polyr <- filter(politicals_byyear, start <= netyr)$id
  econyr <- filter(economics_byyear, start <= netyr)$id
  
  #min kin distance from everyone to one elite group, with the distance
  #column given its final name
  kd_to <- function(targets, colname) {
    out <- min_kd_dfyr(targets, ppl$id, isosyr, kin_distsyr)
    names(out)[names(out) == "minimp"] <- colname
    out
  }
  
  #people alive that year, with the three distance columns attached
  alive <- ppl %>%
    select(id, byear, dyear) %>%
    filter(byear <= netyr,
           dyear >= netyr) %>%
    mutate(id = as.character(id),
           year = netyr,
           isoyear = ifelse(id %in% isosyr, 1, 0)) %>%
    left_join(kd_to(polyr, "politicals_kd"), by = "id") %>%
    left_join(kd_to(socyr, "socials_kd"), by = "id") %>%
    left_join(kd_to(econyr, "economics_kd"), by = "id")
  
  #key is polelite, socelite, econelite pasted together
  group_labels <- c("111" = "All",
                    "010" = "SuperSocial",
                    "001" = "Economic",
                    "100" = "Political",
                    "101" = "Political-Economic",
                    "011" = "Social-Economic",
                    "110" = "Social-Political")
  
  alive %>%
    mutate(econelite = as.numeric(id %in% econyr),
           polelite = as.numeric(id %in% polyr),
           socelite = as.numeric(id %in% socyr)) %>%
    mutate(pol_kd = coalesce(politicals_kd, 100),
           soc_kd = coalesce(socials_kd, 100),
           econ_kd = coalesce(economics_kd, 100)) %>%
    mutate(kinlock = as.numeric(pol_kd < 2 & econ_kd < 2 & soc_kd < 2),
           elitegroup = coalesce(
             unname(group_labels[paste0(polelite, socelite, econelite)]),
             "Regular")) %>%
    select(id, byear, dyear, year, isoyear, econelite, polelite, socelite,
           pol_kd, soc_kd, econ_kd, kinlock, elitegroup)
}

###using the kinship interlocks data, this will tell you what criteria qualified a person as each type of elite
##prints, for each of econ_list / pol_list / soc_list, the names of the
##sub-vectors that contain `id`
etype <- function(id) {
  elite_sources <- list(
    Economics = econ_list,
    Politics  = pol_list,
    Socials   = soc_list
  )
  
  # names of the criteria (sub-vectors) that list this id, per category
  hits <- lapply(elite_sources, function(criteria) {
    has_id <- sapply(criteria, function(ids) id %in% ids)
    names(criteria)[has_id]
  })
  # keep only categories with at least one hit
  hits <- hits[lengths(hits) > 0]
  
  if (length(hits) > 0) {
    cat(sprintf("ID %s is found in:\n", id))
    for (category in names(hits)) {
      criteria_txt <- paste(hits[[category]], collapse = ", ")
      cat(sprintf("  • %s: %s\n", category, criteria_txt))
    }
  } else {
    cat(sprintf("ID %s is not found in Economics, Politics, or Socials.\n", id))
  }
}


###for making elite type summary tables
##subvecs: list of id vectors, one per elite subtype (the list names, or
##         positions if unnamed, become `subtype`)
##data:    data frame with an `id` column plus the columns summarised below
##returns one row per subtype: n plus the mean of each indicator column
##(times 100 for the indicator columns; byear, dyear and neverdal are plain
##means)
summarise_elite <- function(subvecs, data) {
  pct <- function(x) mean(x, na.rm = TRUE) * 100
  
  imap_dfr(subvecs, function(subtype_ids, subtype_label) {
    df <- filter(data, id %in% subtype_ids)
    tibble(
      subtype    = subtype_label,
      n          = nrow(df),
      pct_woman  = pct(df$woman),
      pct_jewish = pct(df$jewish),
      pct_black  = pct(df$black),
      pct_web    = pct(df$web),
      kinlock    = pct(df$in_kinlock),
      econelite  = pct(df$econelite),
      polelite   = pct(df$polelite),
      socelite   = pct(df$socelite),
      upperclass = pct(df$sr),
      avgbyear   = mean(df$byear, na.rm = TRUE),
      avgdyear   = mean(df$dyear, na.rm = TRUE),
      neverdal   = mean(df$neverdal, na.rm = TRUE)
    )
  })
}


##this will return a little df ready for left joining, one row per element of
##popvec, with four columns:
##  id                  person id (character)
##  MinValue            min kin distance to anyone in targetvec
##  MinValueColumnName  id of the (first) closest target, NA if none reachable
##  CountMinValue       how many targets sit at that minimum, 0 if none reachable
##
##targetvec:     ids of the target population
##popvec:        ids to compute distances for
##isos_ids:      ids of isolates (default isos$id)
##kin_dists_mat: distance matrix with ids as row/column names (default kin_dists)
##isolate_value: MinValue given to isolates and to ids not in the matrix
##
##fixes vs the earlier version:
## * ids that are neither isolates nor present in the matrix used to stop the
##   lookup with "subscript out of bounds"; they now fall through to the
##   isolate_value fill at the end, which was already written for them.
## * people with no reachable target (MinValue of Inf) used to be given the
##   first target's id and a positive count; they now get NA and 0, the same
##   as isolates.
min_kd_df <- function(targetvec, popvec, isos_ids = isos$id, kin_dists_mat = kin_dists,
                      isolate_value = Inf) {
  #ensure character IDs 
  targetvec <- as.character(targetvec)
  popvec    <- as.character(popvec)
  isos_ids  <- as.character(isos_ids)
  
  #ids usable for the matrix lookup: not isolates, and present in the matrix
  noisos_target <- unique(targetvec[!targetvec %in% isos_ids &
                                      targetvec %in% rownames(kin_dists_mat)])
  noisos_pop    <- unique(popvec[!popvec %in% isos_ids &
                                   popvec %in% colnames(kin_dists_mat)])
  
  #population isolates (these are the ones we need rows for)
  pop_isolates <- popvec[popvec %in% isos_ids]
  
  #if there are no usable targets (or nobody to look up), everyone's distance
  #is isolate_value
  if (length(noisos_target) == 0 || length(noisos_pop) == 0) {
    out <- tibble::tibble(
      id = popvec,
      MinValue = isolate_value,
      MinValueColumnName = NA_character_,
      CountMinValue = 0L
    )
    return(dplyr::arrange(out, id))
  }
  
  #subset distance matrix: rows = targets, cols = population
  kd_1 <- kin_dists_mat[noisos_target, noisos_pop, drop = FALSE]
  
  #want one row per population person
  kd_2 <- kd_1 %>%
    t() %>%
    as.data.frame() %>%
    tibble::rownames_to_column("id") %>%
    dplyr::mutate(id = as.character(id))
  
  #compute minimum distance to any target for each pop person
  answer <- kd_2 %>%
    tidyr::pivot_longer(
      cols = -id,
      names_to = "ColumnName",
      values_to = "Value"
    ) %>%
    dplyr::group_by(id) %>%
    dplyr::summarize(
      MinValue = min(Value, na.rm = TRUE),
      #[1] keeps this length one even if every Value is NA
      MinValueColumnName = ColumnName[which.min(Value)][1],
      CountMinValue = sum(Value == min(Value, na.rm = TRUE), na.rm = TRUE),
      .groups = "drop"
    ) %>%
    #no reachable target: there is no "closest" person to report
    dplyr::mutate(
      reachable = is.finite(MinValue),
      MinValueColumnName = dplyr::if_else(reachable, MinValueColumnName,
                                          NA_character_),
      CountMinValue = dplyr::if_else(reachable, as.integer(CountMinValue), 0L)
    ) %>%
    dplyr::select(-reachable)
  
  #add population isolates
  isolate_df <- tibble::tibble(
    id = pop_isolates,
    MinValue = isolate_value,
    MinValueColumnName = NA_character_,
    CountMinValue = 0L
  )
  
  #combine
  out <- dplyr::bind_rows(answer, isolate_df) %>%
    dplyr::distinct(id, .keep_all = TRUE) %>%
    dplyr::right_join(tibble::tibble(id = popvec), by = "id") %>%  # keep popvec universe
    dplyr::mutate(
      MinValue = dplyr::if_else(is.na(MinValue), isolate_value, MinValue),
      CountMinValue = dplyr::if_else(is.na(CountMinValue), 0L, as.integer(CountMinValue))
    ) %>%
    dplyr::arrange(id)
  
  return(out)
}


# --------------------------------------------------------------------------------------------------------
  ## GETTING DATAFRAMES AND NETWORK READY in an all-purpose way!

###club and organization members
##one row per membership with a single `from` and `to` value: the *_def column
##when it is filled in, otherwise the *_incl column (NA when both are missing).
##rows without a person id are dropped.
members <- members_raw %>%
  filter(!is.na(id)) %>%
  mutate(from = ifelse(!is.na(from_def), from_def, 
                       ifelse(!is.na(from_incl), from_incl, NA))) %>%
  mutate(to = ifelse(!is.na(to_def), to_def, 
                     ifelse(!is.na(to_incl), to_incl, NA))) %>%
  select(id, org_id, from, to)


##main individuals file
##drops rows with no id, turns the name/place/religion columns into factors
##and adds derived columns: age at death, a jewish flag, a cleaned-up deb
##flag, three membership flags (via is_member) and the primary industry with
##everything from the first "-" onward stripped
ppl <- ppl_raw %>%
  drop_na(id) %>%
  mutate(first = as.factor(first),
         last = as.factor(last),
         maiden = as.factor(maiden),
         deathage = dyear - byear,
         bstate = as.factor(birthstate),
         dstate = as.factor(deathstate)) %>%
  mutate(religion = as.factor(religion),
         jewish = ifelse(religion == "Jewish", 1, 0),
         deb = ifelse(woman == 0 | is.na(deb), 0, 1)) %>%
  mutate(dcc = sapply(id, is_member, groupid = 4883),
         idlewild = sapply(id, is_member, groupid = 5709),
         sc = sapply(id, is_member, groupid = 2425),
         ind1 = as.factor(gsub("-.*", "", industry_primary, ignore.case = TRUE)))


##getting social reg data organized
##keeps id plus the register-listing columns, makes them numeric with missing
##values set to 0, then adds srsum = row total across those columns
pplsr <- ppl %>%
  select(id,
         y86_rb_reg, y00_sr_reg, y02_sr_reg, y04_sr_reg, y06_sr_reg, y09_sr_reg,
         y12_sbb_reg, y14_sbb_reg,
         y25_bb_reg, y26_bb_reg, y27_bb_reg, y28_bb_reg, y29_bb_reg,
         y30_bb_reg, y31_bb_reg, y32_bb_reg, y33_bb_reg, y34_bb_reg,
         y35_bb_reg, y36_bb_reg, y37_bb_reg, y38_bb_reg, y39_bb_reg,
         y40_bb_reg, y41_sr_reg, y41_bb_reg, y42_bb_reg, y43_sr_reg,
         y53_sr_reg, y55_sr_reg, sd7273, sd88) %>%
  mutate(across(-id, function(col) replace_na(as.numeric(col), 0))) %>%
  mutate(srsum = rowSums(across(-id)))


##getting basics from rel raw
##one row per distinct (p1, p2, edgetype) with both first names present, no
##missing values and no self-ties; edgetype is lower-cased with hyphens,
##spaces and underscores removed
rel <- rel_raw %>%
  drop_na(first1, first2) %>%
  distinct(p1, p2, edgetype) %>%
  na.omit() %>%
  mutate(p1_woman = find_gender(p1),
         edgetype = gsub("[- _]", "", tolower(edgetype))) %>%
  filter(p1 != p2)


##adds fam and sr stuff to ppl. this code is clunky!!!!! oh well!!!!
##each new column is the matching find_* helper applied to every person id
ppl <- ppl %>%
  mutate(srsum = sapply(id, find_srsum),
         firstsryr = sapply(id, find_minsr),
         lastsryr = sapply(id, find_maxsr)) %>%
  mutate(dad = sapply(id, find_dad, simplify = "vector"),
         mom = sapply(id, find_mom, simplify = "vector"),
         sibs = sapply(id, find_sib),
         matsibs = sapply(id, find_sib_mat),
         patsibs = sapply(id, find_sib_pat),
         kids = sapply(id, find_kids),
         gparents = sapply(id, find_gparents),
         auntuncles = sapply(id, find_auntuncles),
         fcousins = sapply(id, find_cousinsf))


##adding sibs to rel file. these chunks are all basically calculating direct kin
##ties from the ties that are already in the edgelist. like- i don't want to 
##hand-code sibling ties if i have them both connected to the same parents already. 
###i will do this for sibs, first cousins, aunts/uncles, and grandparents
##
##the loops below use seq_len(nrow(x)) rather than 1:nrow(x): with an empty
##frame 1:0 would run the body for i = 1 and i = 0 instead of not at all.
##rows are appended one person at a time on purpose.
##NOTE(review): the add_* helpers may read `rel` as it grows, so the appends
##were not batched -- confirm before vectorising.
sibss <- ppl %>%
  filter(!is.na(sibs)) %>%
  dplyr::select(id, sibs) 

rel <- rel %>%
  distinct(p1, p2, edgetype) %>%
  na.omit() %>%
  filter(p1 != p2)

##adds siblings to rel file
for (i in seq_len(nrow(sibss))){
  rel <- rbind(rel, add_sibs(sibss[i, 1]))
}

#get df of ids and gparents
gpar <- ppl %>%
  filter(!is.na(gparents)) %>%
  dplyr::select(id, gparents) 

##adds gparents to rel file
for (i in seq_len(nrow(gpar))){
  rel <- rbind(rel, add_gparents(gpar[i, 1]))
}

#get df of ids and auntuncles
auntunc <- ppl %>%
  filter(!is.na(auntuncles)) %>%
  dplyr::select(id, auntuncles) 

##adds auntuncles to rel file
for (i in seq_len(nrow(auntunc))){
  rel <- rbind(rel, add_auntuncles(auntunc[i, 1]))
}

#gets df of ids and cousins and adds cousins to rel file
cousins <- ppl %>%
  filter(!is.na(fcousins)) %>%
  dplyr::select(id, fcousins) 

for (i in seq_len(nrow(cousins))){
  rel <- rbind(rel, add_cousins(cousins[i, 1]))
}

## gets rid of duplicates from sibs and adds gender and birthyear
## Final clean-up of the edge list: drops exact duplicates, rows with missing
## values and self-ties, then treats each tie as undirected by putting the
## smaller id in p1 and the larger in p2 and keeping one row per
## (pair, edgetype).
## NOTE(review): the gender and birth-year columns computed below are dropped
## again by the final select(), so `rel` ends up with only p1, p2, edgetype --
## confirm nothing later expects rel$p1_woman / rel$p1_byear etc.
rel <- rel %>%
  distinct(p1, p2, edgetype) %>%
  na.omit() %>%
  filter(p1 != p2) %>%
  mutate(p1_woman = find_gender(p1),
         p2_woman = find_gender(p2),
         p1_byear = find_birthyear(p1),
         p2_byear = find_birthyear(p2)) %>%
  # canonical ordering of each pair so A-B and B-A collapse to one row
  mutate(p_lo = pmin(p1, p2), p_hi = pmax(p1, p2)) %>%
  distinct(p_lo, p_hi, edgetype, .keep_all = TRUE) %>%
  # column order matters: helper functions index rel[, 1], rel[, 2], rel[, 3]
  select(p1 = p_lo, p2 = p_hi, edgetype)

##spouses
##one row per distinct "Spouse" tie in rel_raw, with each partner's gender,
##birth year and religion, plus the man's / woman's birth year and religion
##and manolderyrs = woman's birth year minus man's birth year
spouses <- rel_raw %>%
  drop_na(p1, p2, first1, first2, edgetype) %>%
  distinct(p1, p2, edgetype) %>%
  filter(p1 != p2) %>%
  filter(edgetype == "Spouse") %>%
  mutate(edgetype = gsub("[- _]", "", tolower(edgetype))) %>%
  filter(!is.na(p1), !is.na(p2)) %>%
  mutate(p1_woman = find_gender(p1),
         p2_woman = find_gender(p2),
         p1_byear = find_birthyear(p1),
         p2_byear = find_birthyear(p2)) %>%
  mutate(mbyear = ifelse(p1_woman == 0, p1_byear, p2_byear),
         wbyear = ifelse(p1_woman == 1, p1_byear, p2_byear),
         manolderyrs = wbyear - mbyear) %>%
  mutate(rel1 = find_religion(p1),
         rel2 = find_religion(p2),
         mrel = ifelse(p1_woman == 0, rel1, rel2),
         wrel = ifelse(p1_woman == 1, rel1, rel2))



##making simple matrix for network graph
##two-column character matrix of tie endpoints
relmatrix <- cbind(p1 = as.character(rel$p1),
                   p2 = as.character(rel$p2))

#finding isolates list so i can put it in the graph
nonisos <- union(rel$p1, rel$p2) #everyone who appears in at least one tie
isolates <- setdiff(ppl$id, nonisos)

##making network graph
##bbg1 has only the tied people; bbg adds one vertex per isolate
bbg1 <- graph_from_edgelist(relmatrix, directed = FALSE)
bbg <- bbg1 %>%
  add_vertices(nv = length(isolates), name = isolates)

##vertex-level measures on the full graph
deg_jbb <- igraph::degree(bbg)
eigen_jbb <- igraph::eigen_centrality(bbg)
bet_jbb <- igraph::betweenness(bbg)
close_jbb <- igraph::closeness(bbg)
coreness_jbb <- igraph::coreness(bbg)
constraint_jbb <- igraph::constraint(bbg)
eccentricity_jbb <- igraph::eccentricity(bbg)

##adding network data to ppl file
##cluster = connected component of bbg the person belongs to. degree is kept
##for everyone (0 when missing); the other measures are kept only for people
##in the giant component and are NA for everyone else.
##
##the giant component is now looked up from the component sizes instead of
##being assumed to carry label 1: component labels are just the numbers igraph
##assigns, so nothing guarantees the largest one is number 1.
bbg_components <- components(bbg)
giant_cluster <- as.character(which.max(bbg_components$csize))
if (giant_cluster != "1") {
  warning("The largest component of bbg is cluster ", giant_cluster,
          ", not cluster 1; check any later code that filters on cluster == 1.")
}

ppl <- ppl %>%
  mutate(cluster = as.factor(bbg_components$membership[as.character(id)]))
ppl <- ppl %>%
  mutate(deg = ifelse(is.na(deg_jbb[as.character(id)]), 0, 
                      deg_jbb[as.character(id)]),
         eig = ifelse((is.na(eigen_jbb$vector[as.character(id)]) | cluster != giant_cluster), NA, 
                      eigen_jbb$vector[as.character(id)]),
         bet = ifelse((is.na(bet_jbb[as.character(id)]) | cluster != giant_cluster), NA,
                      bet_jbb[as.character(id)]),
         close = ifelse((is.na(close_jbb[as.character(id)]) | cluster != giant_cluster), NA,
                        close_jbb[as.character(id)]),
         kcore = ifelse((is.na(coreness_jbb[as.character(id)]) | cluster != giant_cluster), NA,
                        coreness_jbb[as.character(id)]),
         constraint = ifelse((is.na(constraint_jbb[as.character(id)]) | cluster != giant_cluster), NA,
                             constraint_jbb[as.character(id)]),
         eccentricity = ifelse((is.na(eccentricity_jbb[as.character(id)]) | cluster != giant_cluster), NA,
                               eccentricity_jbb[as.character(id)]))

##summarizing by cluster. the giant cluster is the "family web"
##per cluster: head count, share of all people, and the percentages of
##debs and of people flagged jewish
pplclusters <- ppl %>%
  group_by(cluster) %>%
  summarise(People = n(),
            Percent_Total = round(People / nrow(ppl) * 100, 2),
            debs = round(100 * mean(deb), 2),
            jewish = round(100 * mean(jewish), 2))

##adds number of people in each person's cluster to ppl file
##(and recomputes srsum with find_srsum)
ppl <- ppl %>%
  mutate(num_in_clust = pplclusters$People[match(cluster, pplclusters$cluster)],
         srsum = sapply(id, find_srsum))


##df of just ppl listed in social register at some point
##(srsum above 0 and a first listing year before 1950)
sr <- ppl %>%
  filter(srsum > 0) %>%
  filter(firstsryr < 1950)

##adding sr anc and desc
##number of each person's descendants / ancestors who are in sr
ppl <- ppl %>%
  mutate(num_sr_desc = sapply(id, find_nd_sr),
         num_sr_anc = sapply(id, find_na_sr))


##clusters for just social register people 
sr_clust <- sr %>%
  group_by(cluster) %>%
  summarise(People = n(), 
            Percent_Total = round(People / nrow(sr) * 100, 2),
            numdebs = sum(deb),
            numidle = sum(idlewild),
            jewish = round(100 * mean(jewish), 2),
            core = mean(kcore),
            women = mean(woman))


##more sparse rel graph for visualizing sectors of web, other purposes. i don't
##really use this for analysis but it makes clearer ego graphs
##
##rel_few_labels: the hand-entered ties only (none of the derived sibling /
##grandparent / aunt-uncle / cousin rows), ids as character, edgetype cleaned
rel_few_labels <- rel_raw %>%
  drop_na(first1, first2) %>%
  distinct(p1, p2, edgetype) %>%
  na.omit() %>%
  mutate(p1_woman = find_gender(p1),
         edgetype = gsub("[- _]", "", tolower(edgetype))) %>%
  filter(p1 != p2) %>%
  dplyr::select(p1, p2, edgetype) %>%
  mutate(p1 = as.character(p1),
         p2 = as.character(p2))

##rel_f: the same ties as a two-column character matrix, rows in the same
##order as rel_few_labels
rel_f <- rel_few_labels %>%
  dplyr::select(p1, p2) %>%
  as.matrix()

##directed graph of those ties, then the isolates added as extra vertices
bbg2a <- graph_from_edgelist(rel_f, directed = TRUE)
bbg2 <- bbg2a %>%
  add_vertices(nv = length(isolates), name = isolates)

##adding in names and gender for making smaller rel graphs

ppl <- ppl %>%
  mutate(full = paste(first, last))

##lookup tables keyed by character id
attrib_name <- ppl %>%
  mutate(id = as.character(id)) %>%
  select(id, full)

attrib_woman <- ppl %>%
  mutate(id = as.character(id)) %>%
  select(id, woman)

##vertex label = full name, vertex color by the woman flag
V(bbg2)$label <- as.character(
  attrib_name$full[match(V(bbg2)$name, attrib_name$id)])
V(bbg2)$woman <- as.character(
  attrib_woman$woman[match(V(bbg2)$name, attrib_woman$id)])
V(bbg2)$color <- ifelse(V(bbg2)$woman==0, "lightblue", "lightpink")

##edge label = cleaned edge type (rel_few_labels rows are in edge order)
E(bbg2)$label <- rel_few_labels$edgetype


##let's save the distance matrix
##(pairwise path lengths in bbg)
kin_dists <- distances(bbg)
##we need to know who the isolates are
##(rows of ppl whose id is in neither column of rel)
isos <- ppl %>%
  filter(!(id %in% rel$p1 | id %in% rel$p2))


# --------------------------------------------------------------------------------------------------
  ####~*~*~*~*KINSHIP INTERLOCKS: PREPPING THE THREE ELITE TYPES DATA ~*~*~*~*~*~*
  ###if you're wondering- what on earth are these categories? what do they mean?? please see appendix A
  ##and if you've still got questions, email me
  
##this is the dataframe i'll add the info to
kinters <- ppl


##spouses file to get spouses
##one row per "Spouse" row of rel_raw with ids as character, the recorded
##start / end / notes, and startimp = the recorded start year or, when that
##is missing, the later of the two birth years plus 21
spz <- rel_raw %>%
  drop_na(first1, first2, edgetype) %>%
  filter(p1 != p2) %>%
  filter(edgetype == "Spouse") %>%
  mutate(p1_woman = find_gender(p1),
         p2_woman = find_gender(p2),
         p1_byear = find_birthyear(p1),
         p2_byear = find_birthyear(p2),
         p1_dyear = find_deathyear(p1),
         p2_dyear = find_deathyear(p2),
         younger = pmax(p1_byear, p2_byear),
         startimp = as.numeric(ifelse(is.na(start), younger + 21, start)),
         husband = ifelse(p1_woman == 0, p1, p2),
         wife = ifelse(p1_woman == 1, p1, p2)) %>%
  select(p1, p2, start, end, notes, startimp) %>%
  mutate(across(c(p1, p2), as.character))

##for joining
##slim copy of ppl with id as character
pplsimp <- ppl %>%
  mutate(id = as.character(id)) %>%
  select(id, dyear, woman, last, first, maiden, industry_primary, deathage)

##VECTOR OF SOCIAL ELITES
##id vectors taken from flag columns of ppl
debids <- ppl$id[which(ppl$deb == 1)]
otherdebs <- ppl$id[which(ppl$deb_elsewhere == 1)]
jewishdebs <- ppl$id[which(ppl$deb_jewish == 1)]
scids <- ppl$id[which(ppl$sc == 1)]
idleids <- ppl$id[which(ppl$idlewild == 1)]

##membership rows for particular organizations (by org_id)
saduch <- filter(members_raw, org_id == 5354)
cottonpalace <- filter(members_raw, org_id == 1807)
gmardigras <- filter(members_raw, org_id == 3958)
tylerrose <- filter(members_raw, org_id == 6787)
bayouclub <- filter(members_raw, org_id == 4924)
argyleclub <- filter(members_raw, org_id == 6126)
assemblyclub <- filter(members_raw, org_id == 9954)
yachtclub <- filter(members_raw, org_id == 7047)

##club presidents (membership rows with pres == 1)
##NOTE(review): dcc_pres uses org_id 6217 while the `dcc` flag on ppl is built
##from group 4883 -- confirm these are meant to be different organizations.
dcc_pres <- filter(members_raw, org_id == 6217, pres == 1)   ##country club presidents
jl_pres <- filter(members_raw, org_id == 7956, pres == 1)    ##junior league presidents
dwc_pres <- filter(members_raw, org_id == 6389, pres == 1)   ##dallas womens club presidents
terp_pres <- filter(members_raw, org_id == 7722, pres == 1)  ##terp club presidents
colcl_pres <- filter(members_raw, org_id == 7354, pres == 1) ##columbian club presidents
sc_pres <- filter(members_raw, org_id == 2425, pres == 1)    ##shakespeare club presidents
idle_pres <- filter(members_raw, org_id == 5709, pres == 1)  ##idlewild club presidents

## Hand-curated id lists. All ids are written as character strings.

##literally aristocratic families in europe (and one from egypt)
## person ids
aristocracy<- c("55103","57388","20683","45935","83061","28863","27585","98208","35887","44998","41547","72387",
                "81610")

##famous northeastern dynasties- van renss, whartons, roosevelts, vanderbilts, etc
## person ids
dynasties <- c("42729","42551","37347","61445","65787","24537","49287","90859","42492","27108","80739","72952")

##miscstatus
#goodall wooten named "austin's most worthy citizen", ima hogg literally ima hogg, kk very legendary socialite,
#gb dealey literally himself, j howard ardrey having largest farewell dinner like ever then being NYC social elite
## person ids
miscstatus <- c("61493","88417","53523","53978","83108")

#from domhoff 1967- who rules america p 34
## org ids of the listed schools; domschoolstud is every membership row of
## members_raw whose org_id is in the list
domhoffschools <- c("3384","4518","6250","6304","1865","7507","7737","9443",
                    "9603","5512","3616","5625","5392","3939", "6769", "8025", "8084", 
                    "3495", "1049", "1603", "9299", "3603", "1440", "2726")
domschoolstud <- members_raw %>% filter(org_id%in%domhoffschools)

#from kavaler book- right schools starting p 266
## org ids of the listed schools ("9443" appears twice, which makes no
## difference to %in%); kavschoolstud is every membership row of members_raw
## whose org_id is in the list
kavalerschools <- c("4303","1595","9461","6001","9059","2683","9443","2262",
                    "3221","3475","7268","6200","6020","4400","1452","2110",
                    "9692","8448","1995","9061","1019","6408","4223", "2521",
                    "4433","2976","9280","9025","5750","9760","9443","7737",
                    "4518","4356","7112","6250","1865","5625","3616","4132",
                    "9603","3384","7507","5392","6304","8561","5512")
kavschoolstud <- members_raw %>% filter(org_id%in%kavalerschools)




## membership rows for further clubs, one data frame per org_id
metronyc <- filter(members_raw, org_id == 8291) ##metropolitan club of nyc
cosmonyc <- filter(members_raw, org_id == 6876) ##cosmopolitan club of nyc
allegros <- filter(members_raw, org_id == 3560) ##allegro club
artillery <- filter(members_raw, org_id == 4853) ##artillery club of galveston
germanc <- filter(members_raw, org_id == 8648) ##german club of san antonio
alamo <- filter(members_raw, org_id == 2647) ##order of the alamo
colonyc <- filter(members_raw, org_id == 2238) ##colony club


#linz award winners
linz <- na.omit(linz_raw$id)

## all social-elite ids pooled together and de-duplicated
socials <- unique(c(
  debids,
  otherdebs,
  scids,
  idleids,
  dcc_pres$id,
  saduch$id,
  cottonpalace$id,
  gmardigras$id,
  tylerrose$id,
  metronyc$id,
  jl_pres$id,
  dwc_pres$id,
  allegros$id,
  na.omit(terp_pres$id),
  colcl_pres$id,
  bayouclub$id,
  argyleclub$id,
  assemblyclub$id,
  aristocracy,
  dynasties,
  linz,
  artillery$id,
  germanc$id,
  alamo$id,
  colonyc$id,
  cosmonyc$id,
  yachtclub$id,
  miscstatus,
  domschoolstud$id,
  sc_pres$id,
  idle_pres$id
))


##VECTOR OF ECONOMIC ELITES
##biggest fortunes, presidents of banks, insured for most $, etc

##most insured ppl: ids pooled from the ins29, ins28, ins33 and ins30 tables
##(original note: 29, 28, and paid out in 33 --- ALSO GET SPOUSES I THINK??)
mostins <- c(ins29$id, ins28$id, ins33$id, ins30$id)

##millionaires from 1892 ny tribune list
##jc oconnor, sarah cockrell, wh gaston, john simpson, marsalis, slaughter, jb wilson,we hughes, alex and phil sanger
##and then ones in dataset but not dallas list

## rows flagged for 1892 whose City is Dallas
millions1892_d_all <- filter(mil1892_02_raw, y1892 == 1, City == "Dallas")
millions1892_d <- millions1892_d_all$dallasID

## rows flagged for 1892 from any other City that still carry a dallasID
millions1892_elsew_all <- filter(
  mil1892_02_raw,
  y1892 == 1,
  !is.na(dallasID),
  City != "Dallas"
)
millions1892_elsew <- millions1892_elsew_all$dallasID


##spouses of the 1892 millionaires
## marriages with startimp before 1892 and no end, or an end in 1892 or later
spz92 <- filter(spz, startimp < 1892, is.na(end) | end >= 1892)

## join each millionaire against both sides of the marriage table: the first
## join (id = p1) brings in p2, the second (id = p2) brings in p1
mil92_spdf <- pplsimp %>%
  filter(id %in% c(millions1892_d, millions1892_elsew)) %>%
  left_join(spz92, by = c("id" = "p1")) %>%
  left_join(spz92, by = c("id" = "p2")) %>%
  select(p1, p2)

## spouse ids from both sides; NAs (no match) are left in at this point
mil92_sp <- c(mil92_spdf[["p1"]], mil92_spdf[["p2"]])

##millionaires from 1902 list
##belo jr., jc oconnor, alex sanger, phil sanger, cc slaughter, - and non dallas winfield scott, bertrand adoue

## rows flagged for 1902 whose City is Dallas
millions1902_d_all <- filter(mil1892_02_raw, y1902 == 1, City == "Dallas")
millions1902_d <- millions1902_d_all$dallasID

## rows flagged for 1902 from any other City that still carry a dallasID
millions1902_elsew_all <- filter(
  mil1892_02_raw,
  y1902 == 1,
  !is.na(dallasID),
  City != "Dallas"
)
millions1902_elsew <- millions1902_elsew_all$dallasID

###getting spouses
## marriages with startimp before 1902 and no end, or an end in 1902 or later
spz02 <- filter(spz, startimp < 1902, is.na(end) | end >= 1902)

## match each millionaire on either side of the marriage table
mil02_spdf <- pplsimp %>%
  filter(id %in% c(millions1902_d, millions1902_elsew)) %>%
  left_join(spz02, by = c("id" = "p1")) %>%
  left_join(spz02, by = c("id" = "p2")) %>%
  select(p1, p2)
mil02_sp <- c(mil02_spdf[["p1"]], mil02_spdf[["p2"]])


##forbes 1982 millionaires
#caroline rose hunt, lamar hunt, margaret hunt hill,  nelson bunker hunt,perry richardson bass,
#sid richardson bass, wm herbert hunt, hassie hunt, ww caruth jr, trammel crow, clint murchison jr,
#oveta culp hobby, barron hilton, leo f corrigan jr, ross perot
mil1982 <- c(
  "56018", "33924", "21766", "84124", "21540", "31354", "11727", "46247", "46476", "56617",
  "79368", "70881", "24732", "35975", "75543", "72295", "78851"
)

## marriages with startimp before 1982 and no end, or an end in 1982 or later
spz1982 <- filter(spz, startimp < 1982, is.na(end) | end >= 1982)

## match each listed person on either side of the marriage table
mil1982_spdf <- select(
  left_join(
    left_join(filter(pplsimp, id %in% mil1982), spz1982, by = c("id" = "p1")),
    spz1982,
    by = c("id" = "p2")
  ),
  p1, p2
)
mil1982_sp <- c(mil1982_spdf[["p1"]], mil1982_spdf[["p2"]])



##tx monthly 1989 millionaires
## keep only the rows of the raw list that carry an id
txmil1989 <- tx100_89_raw$id[!is.na(tx100_89_raw$id)]

###getting spouses
## marriages with startimp before 1989 and no end, or an end in 1989 or later
spz89 <- filter(spz, startimp < 1989, is.na(end) | end >= 1989)

## match each listed person on either side of the marriage table
mil89_spdf <- pplsimp %>%
  filter(id %in% txmil1989) %>%
  left_join(spz89, by = c("id" = "p1")) %>%
  left_join(spz89, by = c("id" = "p2")) %>%
  select(p1, p2)

## pool both spouse columns, then drop the NAs left by unmatched rows
mil89_sp1 <- c(mil89_spdf[["p1"]], mil89_spdf[["p2"]])
mil89_sp <- mil89_sp1[!is.na(mil89_sp1)]


##misc millionaires
## Hand-coded person ids (character) that are added to the economic-elite
## vector as-is; the names below are the author's description of who is meant.
#ehr green, adolphus busch, j edgar pew, trammell crow, harry hays morgan jr and sr, frank crawford vanderbilt
#wt campbell, founder of texaco, lucy ball and edmund ball, ww caruth jr and mabel, clint john dabney and lucille murchison,
#jock mclean, thomas blake III, tom slick, anne windfohr, tw waggoner, electra waggoner, jake hamon, robert decherd
miscmill <- c("24537","36521","74734","56617","65784","94721","37347","91002", "73382","52010","28465","46476",
              "92198","79368","83841","43508", "97648","17227","70736","29030","41120","70881","16440","50620",
              "97958","23442","64739","21116","82498","62765","49287","42551","80184","26989","84540","17384",
              "95483","63142","44532","28934","84615","34268","92363","95368","62234","90859","71212","44853","79081","25087",
              "62826","68833","93960","14087","43167","33855","93537","10288","83341")

##fortune 1957 rich list
##hl hunt, sid richardson, bill blakley, clint murchison, leo corrigan, robert kleberg, algur hurtle meadows
## keep only the rows of the raw list that carry an id
fortune57 <- fortune57_raw$id[!is.na(fortune57_raw$id)]

###getting spouses
## marriages with startimp before 1957 and no end, or an end in 1957 or later
spz57 <- filter(spz, startimp < 1957, is.na(end) | end >= 1957)

## match each listed person on either side of the marriage table
mil57_spdf <- pplsimp %>%
  filter(id %in% fortune57) %>%
  left_join(spz57, by = c("id" = "p1")) %>%
  left_join(spz57, by = c("id" = "p2")) %>%
  select(p1, p2)
mil57_sp <- c(mil57_spdf[["p1"]], mil57_spdf[["p2"]])

##wealth census 1860 and 1870
## cutoffs are ten times 1630 (personal estate) and ten times 1826 (real estate)
## NOTE(review): presumably the base figures are 1860 medians -- confirm the source
medpp60 <- 1630*10
medrp60 <- 1826*10

## anyone with a recorded 1860 personal or real estate above either cutoff.
## BOTH estate columns are coerced to numeric before NA -> 0, the same way the
## 1870 block does it: a column read from the spreadsheet as text would
## otherwise either fail in replace_na() or be compared to the cutoff as a string
w60 <- ppl %>%
  filter(!is.na(estate_pers1860) | !is.na(estate_real1860)) %>%
  mutate(estate_pers1860 = replace_na(as.numeric(estate_pers1860), 0),
         estate_real1860 = replace_na(as.numeric(estate_real1860), 0)) %>%
  filter(estate_pers1860 > medpp60 | estate_real1860 > medrp60)
w60s <- w60$id

###getting spouses
## marriages with startimp in or before 1860 and no end, or an end in 1860 or later
spz60 <- spz %>% filter(startimp <= 1860, (is.na(end) | end >= 1860))

## match each wealthy person on either side of the marriage table; unmatched
## rows leave NA in p1 / p2
mil60_spdf <- pplsimp %>%
  filter(id %in% w60s) %>%
  left_join(spz60, by = c("id" = "p1")) %>%
  left_join(spz60, by = c("id" = "p2")) %>%
  select(p1, p2)
mil60_sp <- c(mil60_spdf$p1, mil60_spdf$p2)

###land data from 1860
## cash2 is cash with missing values set to 0; cent is its percentile bucket (1-100)
land1860 <- mutate(
  land1860_raw,
  cash2 = ifelse(is.na(cash), 0, cash),
  cent = ntile(cash2, 100)
)

## holders above ten times the median of cash2
med1860 <- median(land1860$cash2, na.rm = TRUE)
ttm_land1860 <- na.omit(filter(land1860, cash2 > (med1860 * 10))$id) ###identical to top 1 percent so we dont need to do that

## spouses of those holders, using the 1860 marriage window (spz60)
land60_spdf <- pplsimp %>%
  filter(id %in% ttm_land1860) %>%
  left_join(spz60, by = c("id" = "p1")) %>%
  left_join(spz60, by = c("id" = "p2")) %>%
  select(p1, p2)
land60_sp <- na.omit(c(land60_spdf[["p1"]], land60_spdf[["p2"]]))


##1870 version
## cutoffs: the 1860 base figures scaled by .75, then times ten
medpp70 <- (1630*.75)*10
medrp70 <- (1826*.75)*10

## anyone with a recorded 1870 personal or real estate above either cutoff;
## both columns are made numeric and missing values count as 0
w70 <- ppl %>%
  filter(!is.na(estate_pers1870) | !is.na(estate_real1870)) %>%
  mutate(
    estate_pers1870 = replace_na(as.numeric(estate_pers1870), 0),
    estate_real1870 = replace_na(as.numeric(estate_real1870), 0)
  ) %>%
  filter(estate_pers1870 > medpp70 | estate_real1870 > medrp70)
w70s <- w70$id

###getting spouses
## marriages with startimp in or before 1870 and no end, or an end in 1870 or later
spz70 <- filter(spz, startimp <= 1870, is.na(end) | end >= 1870)

## match each wealthy person on either side of the marriage table
mil70_spdf <- pplsimp %>%
  filter(id %in% w70s) %>%
  left_join(spz70, by = c("id" = "p1")) %>%
  left_join(spz70, by = c("id" = "p2")) %>%
  select(p1, p2)
mil70_sp <- c(mil70_spdf[["p1"]], mil70_spdf[["p2"]])


##getting 10x median from estate notices
### ok now working on incorporating all the estates data

## person attributes (character ids) to attach to each estate notice
ps <- mutate(
  select(ppl, id, dyear, woman, last, first, maiden, industry_primary, srsum, cluster),
  id = as.character(id)
)

## estate notices with a known amount, joined to cpis by year and to ps by id;
## adj rescales amount by base_cpi / cpi
estates <- estates_raw %>%
  left_join(cpis, by = "year") %>%
  filter(!is.na(amount)) %>%
  mutate(
    id = as.character(id),
    amount = as.numeric(amount)
  ) %>%
  left_join(ps, by = "id") %>%
  mutate(adj = (base_cpi / cpi) * amount)

## ten times the median adjusted amount, and the ids at or above it
ens_tent <- median(estates$adj) * 10
tentimes_ens <- estates %>%
  filter(adj >= ens_tent) %>%
  pull(id)


##GET ESTATES OVERALL
## positive estate values from the person table, joined to cpis on death year;
## adj rescales estate by base_cpi / cpi
allests <- ppl %>%
  select(id, first, last, maiden, dyear, estate) %>%
  mutate(estate = as.numeric(estate)) %>%
  filter(estate > 0) %>%
  left_join(cpis, by = c("dyear" = "year")) %>%
  mutate(adj = (base_cpi / cpi) * estate)

## ten times the median adjusted estate (NAs ignored), and the ids at or above it
medallest_tent <- median(allests$adj, na.rm = TRUE) * 10
tentimes_allest <- allests %>%
  filter(adj >= medallest_tent) %>%
  pull(id)

##get spouses of ppl with big estates
## e1: large estates from the estate notices (id + year of the notice)
## e3: large estates from the person table (id + death year, renamed to year)
e1 <- estates %>% filter(adj >= ens_tent) %>% select(id, year)
e3 <- allests %>% filter(adj >= medallest_tent) %>% select(id, dyear) %>% rename(year = dyear) %>% mutate(id = as.character(id))

## side 1: decedent is p1 of a marriage in force in the estate year -> spouse is p2
spest <- e1 %>%
  add_row(e3) %>%
  left_join(spz, by = c("id" = "p1"),
            relationship = "many-to-many") %>%
  filter(startimp <= year, (is.na(end) | end >= year)) %>%
  select(id, year, p2) %>%
  rename(sp1 = p2)
est_sp1 <- spest$sp1

## side 2: decedent is p2 of a marriage in force in the estate year -> spouse is p1.
## This join restarts from the full e1 + e3 set. Starting from spest (already
## reduced to people who matched as p1) dropped every decedent who only ever
## appears as p2 in spz, so their spouses were never picked up.
spest2 <- e1 %>%
  add_row(e3) %>%
  left_join(spz, by = c("id" = "p2"),
            relationship = "many-to-many") %>%
  filter(startimp <= year, (is.na(end) | end >= year))

## spouse ids from both sides
est_sp <- c(est_sp1, spest2$p1)


##now anyone living in a house worth more than 20k in 1940-- want both head of household
##and spouse

## 1940 home values: keep entries that look like a plain number, convert them,
## and keep the positive ones
hvs40 <- ppl %>%
  select(id, first, last, maiden, byear, dyear, home_40) %>%
  filter(grepl("^-?\\d+\\.?\\d*$", home_40)) %>%
  mutate(
    hv40 = as.numeric(home_40),
    id = as.character(id)
  ) %>%
  filter(hv40 > 0) %>%
  select(-home_40)

## 1930 home values: same clean-up, but without the positive-value filter
hvs30 <- ppl %>%
  select(id, first, last, maiden, byear, dyear, home_30) %>%
  filter(grepl("^-?\\d+\\.?\\d*$", home_30)) %>%
  mutate(
    hv30 = as.numeric(home_30),
    id = as.character(id)
  ) %>%
  select(-home_30)


#now for ppl paying rlly high rent
## 1940 rents: keep entries that look like a plain number, convert them, and
## keep the positive ones
rent40 <- ppl %>%
  select(id, first, last, maiden, byear, dyear, rent_40) %>%
  filter(grepl("^-?\\d+\\.?\\d*$", rent_40)) %>%
  mutate(
    r40 = as.numeric(rent_40),
    id = as.character(id)
  ) %>%
  filter(r40 > 0) %>%
  select(-rent_40)

## 1930 rents: same clean-up
rent30 <- ppl %>%
  select(id, first, last, maiden, byear, dyear, rent_30) %>%
  filter(grepl("^-?\\d+\\.?\\d*$", rent_30)) %>%
  mutate(
    r30 = as.numeric(rent_30),
    id = as.character(id)
  ) %>%
  filter(r30 > 0) %>%
  select(-rent_30)

###getting spouses
## marriages with startimp before 1941 and no end, or an end in 1941 or later
spz40 <- filter(spz, startimp < 1941, is.na(end) | end >= 1941)

###getting spouses 1930
## marriages with startimp before 1931 and no end, or an end in 1931 or later
spz30 <- filter(spz, startimp < 1931, is.na(end) | end >= 1931)



###household home value 1940: one row per person id, holding the largest 1940
##home value recorded for that person or for a spouse
hv40c <- hvs40 %>%
  left_join(spz40, by = c("id" = "p1")) %>%
  left_join(spz40, by = c("id" = "p2")) %>%
  select(id, p1, p2, hv40) %>%
  ## stack the owner id and both spouse columns into a single id column
  pivot_longer(
    cols = c(id, p1, p2),
    names_to = "role",
    values_to = "person"
  ) %>%
  select(id = person, hv40) %>%
  ## unmatched spouse slots are NA and fall out here
  drop_na() %>%
  group_by(id) %>%
  summarise(hv40_comm = max(hv40)) %>%
  mutate(id = as.character(id))


###now for rent: one row per person id, holding the largest 1940 rent recorded
##for that person or for a spouse
rent40c <- rent40 %>%
  left_join(spz40, by = c("id" = "p1")) %>%
  left_join(spz40, by = c("id" = "p2")) %>%
  select(id, p1, p2, r40) %>%
  ## stack the renter id and both spouse columns into a single id column
  pivot_longer(
    cols = c(id, p1, p2),
    names_to = "role",
    values_to = "person"
  ) %>%
  select(id = person, r40) %>%
  ## unmatched spouse slots are NA and fall out here
  drop_na() %>%
  group_by(id) %>%
  summarise(r40_comm = max(r40)) %>%
  mutate(id = as.character(id))



##household home value 1930: one row per person id, holding the largest 1930
##home value recorded for that person or for a spouse
hv30c <- hvs30 %>%
  left_join(spz30, by = c("id" = "p1")) %>%
  left_join(spz30, by = c("id" = "p2")) %>%
  select(id, p1, p2, hv30) %>%
  ## stack the owner id and both spouse columns into a single id column
  pivot_longer(
    cols = c(id, p1, p2),
    names_to = "role",
    values_to = "person"
  ) %>%
  select(id = person, hv30) %>%
  ## unmatched spouse slots are NA and fall out here
  drop_na() %>%
  group_by(id) %>%
  summarise(hv30_comm = max(hv30)) %>%
  mutate(id = as.character(id))

##now for rent 1930
##one row per person id, holding the largest 1930 rent recorded for that person
##or for a spouse
rent30c <- rent30 %>%
  left_join(spz30, by = c("id" = "p1")) %>%
  left_join(spz30, by = c("id" = "p2")) %>%
  select(id, p1, p2, r30) %>%
  ## stack the renter id and both spouse columns into a single id column
  pivot_longer(
    cols = c(id, p1, p2),
    names_to = "role",
    values_to = "person"
  ) %>%
  select(id = person, r30) %>%
  ## unmatched spouse slots are NA and fall out here
  drop_na() %>%
  group_by(id) %>%
  summarise(r30_comm = max(r30)) %>%
  mutate(id = as.character(id))

## 1940 home-value cutoff. 16930 is ten times 1693, the figure the original note
## gives as the TX median; the same note calls the cutoff "ten times USA median"
## and quotes 29380 next to "US median".
## NOTE(review): confirm whether the TX or the US median is the intended base
medhv40tt <- 16930
mansions40_df <- filter(hv40c, hv40_comm >= medhv40tt)
mansions40 <- mansions40_df[["id"]]

##same for 1930
## 29980 is ten times 2998, the figure the original note gives as the TX median
## (it also lists US med 4778 and dallas med 4932 while calling the cutoff
## "ten times USA median"). NOTE(review): confirm the intended base
medhv30tt <- 29980
mansions30_df <- filter(hv30c, hv30_comm >= medhv30tt)
mansions30 <- mansions30_df[["id"]]

## 1940 rent cutoff; original note: ten times USA median for 1940, TX median
## was $17, doing US bc it's stricter
medr40tt <- 270
bigrents40_df <- filter(rent40c, r40_comm >= medr40tt)
bigrents40 <- bigrents40_df[["id"]]

## 1930 rent cutoff; original note: ten times USA median for 1930
medr30tt <- 346
bigrents30_df <- filter(rent30c, r30_comm >= medr30tt)
bigrents30 <- bigrents30_df[["id"]]


###people who inherited ten times median estate size for probate and estate separately
##not ten times median bequest
##
## Both tables are handled the same way: keep rows with a bequest, rescale the
## value by base_cpi / cpi for the decedent's death year (p1dyear), total the
## adjusted values per recipient (p2) and keep recipients whose total reaches
## ens_tent.
## - the per-recipient total uses na.rm = TRUE: one bequest of unknown value no
##   longer turns an heir's whole total into NA (which silently dropped heirs
##   whose known bequests alone clear the cutoff). Heirs with no known value at
##   all total 0 and are still excluded.
## - rows without a recipient id are removed before grouping so that no NA
##   "recipient" can end up in the id vectors.
## NOTE(review): the probate table is compared against ens_tent (the cutoff
## derived from the estate notices) as well -- confirm that is the intended
## threshold for probate bequests
beq_e_tent <- bequests_en_raw %>%
  select(p1, p2, p1dyear, p2_orgid, edgetype, bequest, value_known, value, type_known, type, incomeonly, typeshare,
         notes1, p2woman, p2elder, p2lname, p2fname,
         orgname, notes) %>%
  filter(!is.na(bequest)) %>%
  mutate(p1 = as.character(p1),
         p2 = as.character(p2)) %>%
  left_join(pplsimp, by = c("p2" = "id")) %>%
  left_join(cpis, by = c("p1dyear" = "year")) %>%
  mutate(val_adj = (base_cpi/cpi)*value) %>%
  filter(!is.na(p2)) %>%
  group_by(p2) %>%
  summarise(suminherit = sum(val_adj, na.rm = TRUE)) %>%
  filter(suminherit >= ens_tent)
ebeqtent <- beq_e_tent$p2

## probate bequests: additionally restricted to deaths after 1894 and before 1946
beq_p_tent <- bequests_p_raw %>%
  select(p1, p2, p1dyear, p2_orgid, edgetype, bequest, value_known, value, type_known, type,incomeonly, typeshare,
         notes1, p2woman, p2elder, p2lname, p2fname,
         orgname, notes) %>%
  mutate(p1 = as.character(p1),
         p2 = as.character(p2)) %>%
  filter(!is.na(bequest),
         p1dyear > 1894,
         p1dyear < 1946) %>%
  left_join(pplsimp, by = c("p2" = "id")) %>%
  left_join(cpis, by = c("p1dyear" = "year")) %>%
  mutate(val_adj = (base_cpi/cpi)*value) %>%
  filter(!is.na(p2)) %>%
  group_by(p2) %>%
  summarise(suminherit = sum(val_adj, na.rm = TRUE)) %>%
  filter(suminherit >= ens_tent)
pbeqtent <- beq_p_tent$p2

##bank presidents
## Hand-coded person ids (character). The names below are the author's
## description of who is meant; some ids occur more than once in the vector,
## which is harmless because the combined vector is de-duplicated later.
#em reardon, nathan adams, jw murchison sr and jr, tf murchison, fred florence, rl thornton,
#ben wooten, wh gaston, ej gannon 1, ej gannon 2, rh stewart 1 and 3, lang wharton
#ja pondrom, af hardie, eo tenison, royal a ferris, de waggoner, ll jester, jb adoue 1,
#gw riddle, jc oconnor, gn aldredge ii, ew rose, rushton ardrey, jd gillespie,
#charles l sanger, al slaughter, dan d rogers, we hughes, bm burgher, ha kahler,
#hd ardrey, sj hay, rc ayres, john cristler, wb munson 1, wt waggoner, WF ramsey

bankpres <- c("28187", "83140","75041","12093","70644","43065","39049","99378",
              "45312","56847","94598","30808","27843","62685","81470",
              "15439","68806","80335","29835","24148","81488","14260","96112",
              "80530","55096","12602","46373","84927","32752","68466","53935",
              "69222","43473","28550","92435","13439","21041","92530","92739",
              "27661","30890","46613","29030", "97104","12639","52603","28967",
              "47240","43277","43200","80580","62370","27319","93516","84185","27423",
              "51998","27645","91662","85299","95001","50077","11856","13770","81380",
              "41416","82498","75763","28717","99628","98399","52591","22548","51835","85299",
              "25208","43030","50077","98964","13252","24886","97693","59772","19102","56928",
              "82923","47211","94399","57421","14574","39623","85487","72966","75325",
              "33125","25683","88253","88296","60211","30079","95626","81205","49975","28841",
              "28502","68844","88718","35295","46822","36311","26009","87143","46613","81759",
              "42868","25538","68179","32489","18149","64926","72545","77854","30104","28162","14988",
              "75086","35951","32416","94661","48094","51792","46059","90382","25015","38219","95001",
              "60803","79099","34815","30094","55478","50392","80608")

#high up at federal reserve
## NOTE(review): fedreserve is not included in econ or econ_list in this
## section, and "59178" does not appear in bankpres either -- confirm whether
## it should be part of the economic-elite vector
fedreserve <- c("59178","22548","97104","50077","97693")

##presidents of dallas chamber of commerce
## membership rows of org 5309 with the pres flag set and a non-missing id

cocpres_df <- members_raw %>%
  filter(org_id==5309 &pres==1 & !is.na(id))
cocpres <- unique(cocpres_df$id)

#coc pres elsewhere
cocpres_ew <- c("97243", "61493","10629","12649","71822","52850")




## every economic-elite criterion pooled into one vector (may hold NAs and repeats)
econ <- c(
  mostins,
  millions1892_d, millions1892_elsew, mil92_sp,
  millions1902_d, millions1902_elsew, mil02_sp,
  miscmill,
  fortune57, mil57_sp,
  tentimes_ens, est_sp,
  mil89_sp, txmil1989,
  tentimes_allest,
  mansions40, mansions30,
  w60s, mil60_sp,
  w70s, mil70_sp,
  ebeqtent, pbeqtent,
  bankpres,
  mil1982, mil1982_sp,
  cocpres, cocpres_ew,
  ttm_land1860, land60_sp,
  bigrents30, bigrents40
)

## drop the NAs, then de-duplicate
econ_known <- !is.na(econ)
economics <- unique(econ[econ_known])



##VECTOR OF POLITICAL ELITES
##MAYORS, CONGRESSMEN, JUDGES, COLONELS, GENERALS
#various dallas elected officials
## ids from the raw office sheets, with missing ids removed
mayors <- mayors_raw$id[!is.na(mayors_raw$id)]
uscongdal <- uscongdal_raw$id[!is.na(uscongdal_raw$id)]
citycouncil <- citycouncil_raw$id[!is.na(citycouncil_raw$id)] #council, aldermen, commissioners all together
pcmayors <- pcmayors_raw$id %>% na.omit()
sheriffs <- sheriffs_raw$id %>% na.omit()
daljudges <- daljudges_raw$id %>% na.omit()
schoolboard <- schoolboard_raw$id %>% na.omit()
dalgovatts <- dalgovatts_raw$id %>% na.omit()

## texas legislature, split by chamber (ids taken as they are, NAs not removed)
txcongdal_h <- (txcongdal_raw %>% filter(chamber == "H"))$id #in texas house
txcongdal_s <- (txcongdal_raw %>% filter(chamber == "S"))$id #in texas senate

## us senators from texas with liveddal equal to 1
ussentx_dal <- na.omit((ussentx_raw %>% filter(liveddal == 1))$id)

##pres of dallas bar association
presdalbar <- presdalbar_raw$id %>% na.omit()

##state level elected officials
ltgovs <- ltgovs_raw$id[!is.na(ltgovs_raw$id)]
txgovs_dal <- (txgovs_raw %>% filter(lived_dallas == 1))$id
txsc_dal <- na.omit((txsc_raw %>% filter(liveddal == 1))$id)
rrcom <- rrcom_raw$id %>% na.omit()
utboard <- utboard_raw$id %>% na.omit()
utstudpres <- utstudpres_raw$id %>% na.omit()
utstudvp <- utstudpres_raw$vpid %>% na.omit() ## vp ids sit in the vpid column of the same sheet
txsecstate <- txsecstate_raw$id %>% na.omit()

##republican and democratic party executive committees
## membership rows, one data frame per org_id
demcom <- filter(members_raw, org_id == 1405)
repcom <- filter(members_raw, org_id == 2317)
demcom_dal <- filter(members_raw, org_id == 3893)
dnc <- filter(members_raw, org_id == 8846)
rnc <- filter(members_raw, org_id == 5795)

##politically powerful dallas orgs
##dallas citizens council
dccs <- filter(members_raw, org_id == 4883)
citassoc <- filter(members_raw, org_id == 7682)
cityplancom <- filter(members_raw, org_id == 1276)
cityplan <- filter(members_raw, org_id == 7076)
parkboard <- filter(members_raw, org_id == 1906)
ccas <- filter(members_raw, org_id == 1784)

## presidents of org 5614 with a non-missing id, de-duplicated
prestannehill_df <- filter(members_raw, org_id == 5614, pres == 1, !is.na(id))
tannpres <- unique(prestannehill_df$id)

##misc city government officials
citygov <- filter(members_raw, org_id == 4532)

##from other places than dallas- so not full universe
## career text mentions the office, minus the people already on the dallas lists
othermayors <- ppl %>%
  filter(grepl("mayor", career, ignore.case = TRUE)) %>%
  pull(id) %>%
  setdiff(mayors)
otherjudges <- ppl %>%
  filter(grepl("judge", career, ignore.case = TRUE)) %>%
  pull(id) %>%
  setdiff(daljudges)

## members of org 6978 who are not on either dallas legislature list
othertxcong <- setdiff(
  filter(members_raw, org_id == 6978)$id,
  c(txcongdal_h, txcongdal_s)
)

## sheet rows with the dallas flag equal to 0
otherussentx <- na.omit(filter(ussentx_raw, liveddal == 0)$id)
txgovs_ew <- na.omit(filter(txgovs_raw, lived_dallas == 0)$id)
txsc_ew <- na.omit(filter(txsc_raw, liveddal == 0)$id)

## members of orgs 8826 / 2955 who are not already on the lists above
otherussen <- setdiff(
  filter(members_raw, org_id == 8826, !is.na(id))$id,
  c(otherussentx, ussentx_dal)
)
otheruscong <- setdiff(
  filter(members_raw, org_id == 2955, !is.na(id))$id,
  uscongdal
)

##pres of texas bar association
presoftxbar_df <- filter(members_raw, org_id == 5341, pres == 1, !is.na(id))
barprestx <- unique(presoftxbar_df$id)
#pres of american bar association
barpresam <- c("52577", "56085")
#pres of other bar associations - seaborn eastland of houston bar
barpresother <- c("18040")
#texas highway commission
txhighway <- filter(members_raw, org_id == 6387, !is.na(id))

##special govt attorneys outside of dallas
## hand-coded ids, minus anyone already on the dallas government attorney list
othergovatts <- setdiff(
  c("28967", "79247", "59431", "29989", "25578", "88613", "59419", "35171", "34121",
    "21935", "91246", "39669", "27397", "23346", "71765", "39669", "33650", "48757",
    "64447", "27108", "50337", "12323", "93923", "93732", "78529"),
  dalgovatts
)

##based on job title
## ids of people whose career text contains the keyword (case-insensitive)
ambassadors <- c((filter(ppl, grepl("ambassador", career, ignore.case = TRUE)))$id,
                 (filter(ppl, grepl("diplomat", career, ignore.case = TRUE)))$id)
postmasters <- (filter(ppl, grepl("postmaster", career, ignore.case = TRUE)))$id #postmasters were a bigger deal then
## keyword matches on "politician" plus hand-coded ids
miscpoliticians <- c((filter(ppl, grepl("politician", career, ignore.case = TRUE)))$id,
                     "29755","75689","59024","61184","62812","16737","15309","64919","50689","62845","88828",
                     "45444","18727","98442","77122","39060","11156","67701","42492","38420","35246",
                     "56614","10629","19008","38304")


## hand-coded ids
## NOTE(review): activists is not included in politicals_1 or pol_list in this
## section -- confirm whether it is meant to count toward the political elite
activists <- c("10929","51897","17384","74650","31762","50617","24850","64444","68607","59179","73285","76626","23988",
               "51897","53445","82190","59179","13610","79368","17384","26989","11870","48896", "98924",
               "13610","64900","81072","75390","78630","32088")
powerbrokers <- c("17090","99671","19258","87167","36665","97061","32794","18890",
                  "68391","88107","80530","91913","67032","47268","85694","31227","92015","83341","96656") ##behind the scenes calling presidents and stuff


spies <- c("20041", "94624","89639") #fbi and cia lol
miscbureaucrats <- c("10629","34068","64739","14574","88296","56523","62180",
                     "70318","79309") ##Incl rich ppl given government posts at fed level like ej gould

##military
#only ones that i know were at least colonels
## hand-coded ids; a few occur twice, which is harmless once the combined
## vector is passed through unique()
military <- c("36964","25980","46044","22585","44241","75060","61266","20230","20230","96928","26589",
              "90278","59764","95643","39915","80826","81590","66394","97423","48909","49546","94967","96975","82303",
              "55547","54386","79182","83052","60519","42577","30516","92586","43338","17469","99159","62087","36051",
              "18490","30213","94916","62845","91067","80957","36702","55103","28081","91922","89639",
              "89397","10723","95300","14953","21583","15823","90297","45710","88971","59458","80917","52450","41369",
              "68549","49287","60819","53788","67846","88296","65295","51891","95726", "56062","92711","43386",
              "39669","26142","31460","39669","27489","24732","28935","87729","30588","41526","53800","76271","48164","98973",
              "33392","82772","83842")
## membership rows for the three academies, one data frame per org_id
navalacad <- members_raw %>% filter(org_id==2479)
westpoint <- members_raw %>% filter(org_id==4665)
citadel <- members_raw %>% filter(org_id==5356)



## every listed political criterion pooled and de-duplicated
## (each source appears once; unique() makes repeats irrelevant)
politicals_1 <- unique(c(
  mayors, uscongdal, txcongdal_h, txcongdal_s, ussentx_dal, otherussentx,
  txsc_dal, txsc_ew, txgovs_dal, txgovs_ew, otherussen,
  otheruscong, ambassadors, military, otherjudges,
  dccs$id, ccas$id, txhighway$id,
  presdalbar, dalgovatts, barpresother,
  sheriffs, citycouncil, citygov$id,
  navalacad$id, westpoint$id, citadel$id,
  othermayors, othertxcong, daljudges, parkboard$id,
  barprestx, schoolboard, citassoc$id, postmasters, barpresam, txsecstate,
  utstudpres, utstudvp, rrcom, cityplancom$id, cityplan$id,
  spies, miscbureaucrats, powerbrokers, tannpres, ltgovs,
  demcom$id, repcom$id, demcom_dal$id, dnc$id, rnc$id,
  othergovatts, utboard
))

##catching remaining politicians
## anyone on the misc list who is not already covered above
miscpoliticals <- setdiff(miscpoliticians, politicals_1)

#putting it all together
politicals <- c(politicals_1, miscpoliticals)


##getting rid of the ones that i removed from the replication data due to them maybe being alive
## keep only ids that still exist in the person table
politicals <- politicals[is.element(politicals, ppl$id)]
socials <- socials[is.element(socials, ppl$id)]
economics <- economics[is.element(economics, ppl$id)]




##for the function that tells me what elite type people are and what criteria qualified them for it

## named list of the economic criteria; every element is named after the vector
## it holds (lst() derives the names from the argument expressions)
econ_list <- tibble::lst(
  mostins,
  millions1892_d,
  millions1892_elsew,
  mil92_sp,
  millions1902_d,
  millions1902_elsew,
  mil02_sp,
  miscmill,
  fortune57,
  mil57_sp,
  tentimes_ens,
  est_sp,
  mil89_sp,
  txmil1989,
  tentimes_allest,
  mansions40,
  mansions30,
  bigrents30,
  bigrents40,
  w60s,
  mil60_sp,
  w70s,
  mil70_sp,
  ebeqtent,
  pbeqtent,
  bankpres,
  mil1982,
  mil1982_sp,
  cocpres,
  cocpres_ew,
  ttm_land1860,
  land60_sp
)

## named list of the political criteria. Plain vectors are auto-named by lst();
## membership data frames contribute their id column under the frame's name
pol_list <- tibble::lst(
  mayors,
  uscongdal,
  otheruscong,
  ussentx_dal,
  otherussentx,
  otherussen,
  txgovs_dal,
  txgovs_ew,
  miscpoliticals,
  ambassadors,
  military,
  otherjudges,
  dccs = dccs$id,
  ccas = ccas$id,
  ltgovs,
  txhighway = txhighway$id,
  presdalbar,
  dalgovatts,
  sheriffs,
  citycouncil,
  citygov = citygov$id,
  navalacad = navalacad$id,
  westpoint = westpoint$id,
  citadel = citadel$id,
  othermayors,
  daljudges,
  demcom = demcom$id,
  demcom_dal = demcom_dal$id,
  repcom = repcom$id,
  dnc = dnc$id,
  rnc = rnc$id,
  othergovatts,
  txcongdal_h,
  txcongdal_s,
  othertxcong,
  cityplan = cityplan$id,
  parkboard = parkboard$id,
  utboard,
  barprestx,
  barpresother,
  schoolboard,
  citassoc = citassoc$id,
  postmasters,
  barpresam,
  txsc_dal,
  txsc_ew,
  txsecstate,
  utstudpres,
  utstudvp,
  rrcom,
  cityplancom = cityplancom$id,
  spies,
  miscbureaucrats,
  powerbrokers,
  tannpres
)

## named list of the social criteria. Plain vectors are auto-named by lst();
## membership data frames contribute their id column under the frame's name
soc_list <- tibble::lst(
  debids,
  otherdebs,
  scids,
  idleids,
  dcc_pres = dcc_pres$id,
  saduch = saduch$id,
  cottonpalace = cottonpalace$id,
  gmardigras = gmardigras$id,
  tylerrose = tylerrose$id,
  metronyc = metronyc$id,
  jl_pres = jl_pres$id,
  dwc_pres = dwc_pres$id,
  allegros = allegros$id,
  terp_pres = na.omit(terp_pres$id),
  colcl_pres = colcl_pres$id,
  bayouclub = bayouclub$id,
  argyleclub = argyleclub$id,
  assemblyclub = assemblyclub$id,
  aristocracy,
  dynasties,
  linz,
  artillery = artillery$id,
  germanc = germanc$id,
  alamo = alamo$id,
  colonyc = colonyc$id,
  cosmonyc = cosmonyc$id,
  yachtclub = yachtclub$id,
  miscstatus,
  ## students of the domhoff schools, stored under the school-list name
  domhoffschools = domschoolstud$id,
  sc_pres = sc_pres$id,
  idle_pres = idle_pres$id
)





################WHO IS IN A KINSHIP INTERLOCK???????????###########

##saving kin distances
## min_kd_df() is called once per elite vector against all person ids; its
## MinValue / MinValueColumnName / CountMinValue columns are renamed with the
## elite type as prefix so the three results can sit side by side after joining
politicals_kd <- dplyr::rename(
  min_kd_df(unique(politicals), ppl$id),
  politicals_kd = MinValue,
  politicals_closest = MinValueColumnName,
  politicals_ties = CountMinValue
)

economics_kd <- dplyr::rename(
  min_kd_df(unique(economics), ppl$id),
  economics_kd = MinValue,
  economics_closest = MinValueColumnName,
  economics_ties = CountMinValue
)

socials_kd <- dplyr::rename(
  min_kd_df(unique(socials), ppl$id),
  socials_kd = MinValue,
  socials_closest = MinValueColumnName,
  socials_ties = CountMinValue
)

## Person-level table: selected attributes from ppl, the three kin-distance
## tables joined on id, 0/1 membership flags for each elite set, and a label
## (elitegroup) naming the combination of sets the person belongs to.
kinters2 <- ppl %>%
  dplyr::select(
    id, full, woman, black, last, first, middle, maiden, byear, dyear,
    cluster, industry_primary, kcore, religion, denomination,
    srsum, firstsryr, jewish, arrive_dallas, left_dallas, wikidataID
  ) %>%
  dplyr::mutate(id = as.character(id)) %>%
  dplyr::left_join(politicals_kd, by = "id") %>%
  dplyr::left_join(socials_kd,    by = "id") %>%
  dplyr::left_join(economics_kd,  by = "id") %>%
  dplyr::mutate(
    econelite = as.integer(id %in% economics),
    polelite  = as.integer(id %in% politicals),
    socelite  = as.integer(id %in% socials),
    # key is the three flags pasted as political / social / economic
    elitegroup = unname(
      c(
        "111" = "All",
        "010" = "SuperSocial",
        "001" = "Economic",
        "100" = "Political",
        "101" = "Political-Economic",
        "011" = "Social-Economic",
        "110" = "Social-Political",
        "000" = "Regular"
      )[paste0(polelite, socelite, econelite)]
    ),
    black = tidyr::replace_na(black, 0),
    # 1 when cluster == 1, 0 otherwise, NA when cluster is missing
    web = as.integer(cluster == 1)
  )

## People whose kin distance to the political, economic AND social elite sets
## is below 2 in every case; rows with a missing distance are dropped.
kinterlocks <- dplyr::filter(
  kinters2,
  politicals_kd < 2 & economics_kd < 2 & socials_kd < 2
)



## Colours for the visualisations.
## Elite combinations get a fixed fill. Anyone without one ("Regular") falls
## back to: srsum > 0, then arrive_dallas == "never", then a default.
## eltype_fill holds R colour names, eltype_fill2 hex codes, and eltype_frame
## outlines members of kinterlocks in black.
kinters2 <- kinters2 %>%
  mutate(
    eltype_fill = coalesce(
      unname(
        c(
          "All"                = "white",
          "Economic"           = "darkgoldenrod1",
          "SuperSocial"        = "dodgerblue2",
          "Political"          = "red2",
          "Political-Economic" = "darkorange1",
          "Social-Economic"    = "springgreen3",
          "Social-Political"   = "darkviolet"
        )[elitegroup]
      ),
      case_when(
        srsum > 0 ~ "slategray1",
        arrive_dallas == "never" ~ "slategray",
        TRUE ~ "gray84"
      )
    ),
    eltype_fill2 = coalesce(
      unname(
        c(
          "All"                = "#FFFFFF",
          "Economic"           = "#ffb90f",
          "SuperSocial"        = "#1c86ee",
          "Political"          = "#ee0000",
          "Political-Economic" = "#ff7f00",
          "Social-Economic"    = "#00cd66",
          "Social-Political"   = "#9400d3"
        )[elitegroup]
      ),
      case_when(
        srsum > 0 ~ "#0E2D4B",
        arrive_dallas == "never" ~ "#3C3E40",
        TRUE ~ "#585A5C"
      )
    ),
    eltype_frame = if_else(id %in% kinterlocks$id, "black", "gray84")
  )



## Directed kinship graph built from the rel_f edge list; the ids in `isolates`
## are appended as vertices without any edges.
bbg2b1 <- graph_from_edgelist(rel_f, directed = TRUE)
bbg2b <- add_vertices(bbg2b1, nv = length(isolates), name = isolates)

## display name stored on ppl: "first last"
ppl <- mutate(ppl, full = paste(first, last))

## id -> fill colour and id -> frame colour lookups; ids are converted to
## character so they can be matched against vertex names
attrib_fill <- kinters2 %>%
  mutate(id = as.character(id)) %>%
  select(id, eltype_fill)

attrib_frame <- kinters2 %>%
  mutate(id = as.character(id)) %>%
  select(id, eltype_frame)

## vertex label, fill and frame colour, each looked up by vertex name
## (attrib_name is built elsewhere in the file)
bbg2b <- set_vertex_attr(
  bbg2b, "label",
  value = as.character(attrib_name$full[match(V(bbg2b)$name, attrib_name$id)])
)
bbg2b <- set_vertex_attr(
  bbg2b, "color",
  value = as.character(attrib_fill$eltype_fill[match(V(bbg2b)$name, attrib_fill$id)])
)
bbg2b <- set_vertex_attr(
  bbg2b, "frame.color",
  value = as.character(attrib_frame$eltype_frame[match(V(bbg2b)$name, attrib_frame$id)])
)

## edge labels are taken positionally from rel_few_labels
## (assumes its rows are in the same order as rel_f -- verify)
bbg2b <- set_edge_attr(bbg2b, "label", value = rel_few_labels$edgetype)

# kinters3 = kinters2 plus three 0/1 indicator columns:
#   sr         - srsum greater than 1 (NA when srsum is missing)
#   in_kinlock - id appears in kinterlocks
#   neverdal   - arrive_dallas is exactly "never" (missing counts as 0)
# NOTE(review): the threshold here is srsum > 1, while the colour coding and
# the Venn diagram elsewhere in this file use srsum > 0 -- confirm intended.
kinters3 <- kinters2 %>%
  mutate(
    sr = as.numeric(srsum > 1),
    in_kinlock = as.numeric(id %in% kinterlocks$id),
    neverdal = as.numeric(arrive_dallas %in% "never")
  )

####### TYPES OF ELITES
### summaries of each elite subtype, as in the appendix
elite_lists <- list(
  Economics = econ_list,
  Politics  = pol_list,
  Socials   = soc_list
)

## One table per elite domain: the per-subtype rows from summarise_elite()
## (defined elsewhere in this file), followed by an "all" row pooling every id
## in the domain and an "all_dallas" row restricted to neverdal == 0.
summary_tables <- map(elite_lists, function(subvecs) {
  # percentage of a 0/1 column, ignoring missing values
  pct <- function(x) mean(x, na.rm = TRUE) * 100

  # one pooled summary row for the people in `members`
  # NOTE(review): neverdal is left as a proportion while the other shares are
  # multiplied by 100 -- check this matches summarise_elite().
  pooled_row <- function(label, members) {
    tibble(
      subtype    = label,
      n          = nrow(members),
      pct_woman  = pct(members$woman),
      pct_jewish = pct(members$jewish),
      pct_black  = pct(members$black),
      pct_web    = pct(members$web),
      kinlock    = pct(members$in_kinlock),
      econelite  = pct(members$econelite),
      polelite   = pct(members$polelite),
      socelite   = pct(members$socelite),
      upperclass = pct(members$sr),
      avgbyear   = mean(members$byear, na.rm = TRUE),
      avgdyear   = mean(members$dyear, na.rm = TRUE),
      neverdal   = mean(members$neverdal, na.rm = TRUE)
    )
  }

  # everyone who appears in at least one subtype of this domain
  everyone <- kinters3 %>% filter(id %in% unique(unlist(subvecs)))

  bind_rows(
    summarise_elite(subvecs, kinters3),
    pooled_row("all", everyone),
    pooled_row("all_dallas", filter(everyone, neverdal == 0))
  )
})


econ_summary <- summary_tables[["Economics"]]
pol_summary  <- summary_tables[["Politics"]]
soc_summary  <- summary_tables[["Socials"]]




######making the networks of specific elite types
###NOTE: to make the plots in the paper itself, I used the website cosmograph.app, which is much better 
#than igraph at plotting big networks. I built the graphs, screenshotted them, put them into
#photoshop, removed the dark blue background, and then put them into illustrator where I added 
#the legends. so this code will construct the same graph objects but the visualizations
#will look quite different (kinda illegible tbh). highly recommend cosmograph! it's awesome!!

##first- political elites
## kin ties whose two endpoints are both in the political elite set
relpol <- rel %>%
  filter(p1 %in% politicals,
         p2 %in% politicals)

## two-column character matrix (p1, p2) in the form graph_from_edgelist() takes
relmatrix_pol <- relpol %>%
  dplyr::select(p1, p2) %>%
  mutate(p1 = as.character(p1),
         p2 = as.character(p2)) %>%
  as.matrix()


##political isolates: political elites that appear in none of those ties
isos_pol <- setdiff(politicals, c(relpol$p1, relpol$p2))

## undirected graph of the ties, with the isolates added as edgeless vertices
gpol1 <- graph_from_edgelist(relmatrix_pol,
                             directed = FALSE)
gpol <- add_vertices(gpol1, nv = length(isos_pol), name = isos_pol)


##finds maximum component size 
max(components(gpol)$csize)

##finds share of isolates
## setdiff() returns each isolate once, so the denominator is de-duplicated as
## well; otherwise repeated ids in `politicals` would understate the share
length(isos_pol) / length(unique(politicals))

## vertex label / fill / frame colour, looked up by vertex name
V(gpol)$label <- as.character(attrib_name$full[match(V(gpol)$name, 
                                                     attrib_name$id)])
V(gpol)$color <- as.character(attrib_fill$eltype_fill[match(V(gpol)$name, 
                                                            attrib_fill$id)])
V(gpol)$frame.color <- as.character(attrib_frame$eltype_frame[match(V(gpol)$name, 
                                                                    attrib_frame$id)])

plot(gpol,
     vertex.label = NA,
     vertex.size = 2,
     edge.color = 'gray9',
     edge.arrow.size = .1,
     main.family = "Arial",
     vertex.shape = "circle",
     edge.label.font = 1, 
     edge.label.color = "darkgreen",
     edge.label.family = "Helvetica",
     edge.label.cex = .5,
     main = "Political lineages",
     layout = layout_nicely(gpol))


##now- social elites

## kin ties whose two endpoints are both in the social elite set
relsoc <- rel %>%
  filter(p1 %in% socials,
         p2 %in% socials)

## two-column character matrix (p1, p2) in the form graph_from_edgelist() takes
relmatrix_soc <- relsoc %>%
  dplyr::select(p1, p2) %>%
  mutate(p1 = as.character(p1),
         p2 = as.character(p2)) %>%
  as.matrix()


##social isolates: social elites that appear in none of those ties
isos_soc <- setdiff(socials, c(relsoc$p1, relsoc$p2))

## undirected graph of the ties, with the isolates added as edgeless vertices
gsoc1 <- graph_from_edgelist(relmatrix_soc,
                             directed = FALSE)
gsoc <- add_vertices(gsoc1, nv = length(isos_soc), name = isos_soc)



##finds maximum component size 
max(components(gsoc)$csize)

##finds share of isolates
## setdiff() returns each isolate once, so the denominator is de-duplicated as
## well; otherwise repeated ids in `socials` would understate the share
length(isos_soc) / length(unique(socials))




## vertex label / fill / frame colour, looked up by vertex name
V(gsoc)$label <- as.character(attrib_name$full[match(V(gsoc)$name, 
                                                     attrib_name$id)])
V(gsoc)$color <- as.character(attrib_fill$eltype_fill[match(V(gsoc)$name, 
                                                            attrib_fill$id)])
V(gsoc)$frame.color <- as.character(attrib_frame$eltype_frame[match(V(gsoc)$name, 
                                                                    attrib_frame$id)])

plot(gsoc,
     vertex.label = NA,
     vertex.size = 2,
     edge.color = 'gray9',
     edge.arrow.size = .1,
     main.family = "Arial",
     vertex.shape = "circle",
     edge.label.font = 1, 
     edge.label.color = "darkgreen",
     edge.label.family = "Helvetica",
     edge.label.cex = .5,
     main = "Socialite lineages",
     layout = layout_nicely(gsoc))


##now- econ elites

## kin ties whose two endpoints are both in the economic elite set
relecon <- rel %>%
  filter(p1 %in% economics,
         p2 %in% economics)

## two-column character matrix (p1, p2) in the form graph_from_edgelist() takes
relmatrix_econ <- relecon %>%
  dplyr::select(p1, p2) %>%
  mutate(p1 = as.character(p1),
         p2 = as.character(p2)) %>%
  as.matrix()

##economic isolates: economic elites that appear in none of those ties
isos_econ <- setdiff(economics, c(relecon$p1, relecon$p2))

## undirected graph of the ties, with the isolates added as edgeless vertices
gecon1 <- graph_from_edgelist(relmatrix_econ,
                              directed = FALSE)
gecon <- add_vertices(gecon1, nv = length(isos_econ), name = isos_econ)



##finds maximum component size 
max(components(gecon)$csize)

##finds share of isolates
## setdiff() returns each isolate once, so the denominator is de-duplicated as
## well; otherwise repeated ids in `economics` would understate the share
length(isos_econ) / length(unique(economics))

## vertex label / fill / frame colour, looked up by vertex name
V(gecon)$label <- as.character(attrib_name$full[match(V(gecon)$name, 
                                                      attrib_name$id)])
V(gecon)$color <- as.character(attrib_fill$eltype_fill[match(V(gecon)$name, 
                                                             attrib_fill$id)])
V(gecon)$frame.color <- as.character(attrib_frame$eltype_frame[match(V(gecon)$name, 
                                                                     attrib_frame$id)])

plot(gecon,
     vertex.label = NA,
     vertex.size = 3,
     edge.color = 'gray9',
     edge.arrow.size = .1,
     main.family = "Arial",
     vertex.shape = "circle",
     edge.label.font = 1, 
     edge.label.color = "darkgreen",
     edge.label.family = "Helvetica",
     edge.label.cex = .5,
     main = "Millionaire lineages",
     layout = layout_nicely(gecon))


########################PROPEL SECTION################################

###now making the stacked bar charts for the propel section
##i made all of these look nicer in illustrator afterward


### keep only the political subtypes shown in the figure; the same vector
### fixes the left-to-right bar order through the factor levels
pol_df <- local({
  shown <- c(
    "schoolboard", "citycouncil", "dalgovatts", "mayors",
    "txcongdal_h", "txcongdal_s", "ltgovs", "txgovs_dal",
    "uscongdal", "ussentx_dal",
    "presdalbar", "barprestx"
  )
  pol_summary %>%
    filter(subtype %in% shown) %>%
    mutate(subtype = factor(subtype, levels = shown))
})

# plot: a white 0-100 bar per subtype with the kinlock share drawn over it
ggplot(pol_df, aes(x = subtype)) +
  geom_col(aes(y = 100), fill = "white", colour = "black") +
  geom_col(aes(y = kinlock), fill = "grey40", colour = "black") +
  scale_y_continuous(
    limits = c(0, 100),
    labels = label_number(accuracy = 1, suffix = "%")
  ) +
  labs(
    title = "Kinlock Percentage by Political Subtype",
    x     = NULL,
    y     = "% In Kinlock"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    panel.grid.major.x = element_blank(),
    axis.text.x        = element_text(angle = 25, hjust = 1)
  )


### now for social elites: keep the six subtypes shown in the figure; the same
### vector fixes the bar order through the factor levels
soc_df <- local({
  shown <- c("idleids", "idle_pres", "scids", "sc_pres", "debids", "saduch")
  soc_summary %>%
    filter(subtype %in% shown) %>%
    mutate(subtype = factor(subtype, levels = shown))
})

# plot: a white 0-100 bar per subtype with the kinlock share drawn over it
ggplot(soc_df, aes(x = subtype)) +
  geom_col(aes(y = 100), fill = "white", colour = "black") +
  geom_col(aes(y = kinlock), fill = "grey40", colour = "black") +
  scale_y_continuous(
    limits = c(0, 100),
    labels = label_number(accuracy = 1, suffix = "%")
  ) +
  labs(
    title = "Kinlock Percentage by Social Elite Subtype",
    x     = NULL,
    y     = "% In Kinlock"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    panel.grid.major.x = element_blank(),
    axis.text.x        = element_text(angle = 25, hjust = 1)
  )


### econ elites are handled differently: estates are split into deciles of
### `adj` and the share of kinship-interlock members is computed per decile
estates_plotdf <- estates %>%
  mutate(kinlock = as.numeric(id %in% kinterlocks$id))


decile_df <- estates_plotdf %>%
  #filter(source=="estate notice") %>%
  mutate(decile = ntile(adj, 10)) %>%
  group_by(decile) %>%
  # one grouping variable, so summarise() already returns an ungrouped table
  summarise(kinlock_pct = mean(kinlock, na.rm = TRUE) * 100) %>%
  # factor with labels D1..D10 keeps the bars in decile order
  mutate(decile = factor(decile, levels = 1:10, labels = paste0("D", 1:10)))

# plot: white 0-100 background bar with the kinlock share drawn over it;
# the y axis runs to 110 so there is headroom above the bars
ggplot(decile_df, aes(x = decile)) +
  geom_col(aes(y = 100), width = 0.6, fill = "white", colour = "black") +
  geom_col(aes(y = kinlock_pct), width = 0.6, fill = "grey40", colour = "black") +
  scale_y_continuous(
    labels = label_number(suffix = "%", accuracy = 1),
    limits = c(0, 110),
    expand = expansion(mult = c(0, 0))
  ) +
  labs(
    title = "Kinlock Percentage by Estate Size Decile",
    x     = "Estate size decile",
    y     = "% in Kinlock"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    panel.grid.major.x = element_blank(),
    axis.text.x        = element_text(angle = 45, hjust = 1)
  )


## Same decile plot for hv40 (1940 home value, per the plot labels).
## kinlock flags kinship-interlock members; sr flags ids found in the `sr`
## table built elsewhere in the file.
## NOTE(review): the title says "within Dallas upper class" but the sr == 1
## filter below is commented out, so every row of hvs40 is included -- confirm.
hv40_plotdf <- hvs40 %>%
  mutate(
    kinlock = as.numeric(id %in% kinterlocks$id),
    sr = as.numeric(id %in% sr$id)
  )


hvdecile_df <- hv40_plotdf %>%
  #filter(sr==1) %>%
  mutate(decile = ntile(hv40, 10)) %>%
  group_by(decile) %>%
  # one grouping variable, so summarise() already returns an ungrouped table
  summarise(kinlock_pct = mean(kinlock, na.rm = TRUE) * 100) %>%
  # factor with labels D1..D10 keeps the bars in decile order
  mutate(decile = factor(decile, levels = 1:10, labels = paste0("D", 1:10)))

# plot: white 0-100 background bar with the kinlock share drawn over it;
# the y axis runs to 110 so there is headroom above the bars
ggplot(hvdecile_df, aes(x = decile)) +
  geom_col(aes(y = 100), width = 0.6, fill = "white", colour = "black") +
  geom_col(aes(y = kinlock_pct), width = 0.6, fill = "grey40", colour = "black") +
  scale_y_continuous(
    labels = label_number(suffix = "%", accuracy = 1),
    limits = c(0, 110),
    expand = expansion(mult = c(0, 0))
  ) +
  labs(
    title = "Kinship interlock percentage by 1940 home value within Dallas upper class",
    x     = "Home value 1940 decile",
    y     = "% in Kinlock"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    panel.grid.major.x = element_blank(),
    axis.text.x        = element_text(angle = 45, hjust = 1)
  )


####################### PERSISTENCE CALCULATIONS######################

## Lookup tables for the persistence calculations, all keyed by a character id.

## id -> lastsryr, plus whatever columns hv40c contributes through the join
idlsr <- ppl %>%
  mutate(id = as.character(id)) %>%
  select(id, lastsryr) %>%
  left_join(hv40c, by = "id")

## id -> kids
idkids <- ppl %>%
  mutate(id = as.character(id)) %>%
  select(id, kids)

## Reverse of the auntuncles column: one row per (aunt/uncle id, nn) pair, where
## nn is the person who listed that id among their auntuncles
idnns <- ppl %>%
  select(id, auntuncles) %>%
  unnest(auntuncles) %>%
  transmute(nn = id, id = as.character(auntuncles)) %>%
  filter(!is.na(id)) %>%
  select(id, nn) %>%
  unique()

## id -> kidnn, the person's kids combined with one nn per row
## (the join repeats an id once for every nn it has)
idkidsnns <- idkids %>%
  left_join(idnns, by = "id") %>%
  mutate(kidnn = map2(kids, nn, c), id = as.character(id)) %>%
  select(id, kidnn)

##finding desc after 1940 using a series of joins
## For everyone born in or before 1880, fan out from the person to their kidnn
## entries, then to those people's kids (gkids) and to their kids (ggkids),
## attaching lastsryr and hv40_comm at every level. The table is then collapsed
## back to one row per id:
##   lastsr_desc - largest lastsryr among self and linked descendants (0 if none)
##   maxh40d     - largest hv40_comm among the same people (0 if none)
##   h40d        - maxh40d with the 0 placeholder turned back into NA
## Missing links are carried as the string "placeholder" so unnest() keeps the
## row; that id matches nothing in idlsr and so contributes only NAs.
kinteld <- kinters2 %>%
  filter(byear <= 1880) %>%
  select(id, full, woman, byear, dyear, politicals_kd, socials_kd, economics_kd,  elitegroup) %>%
  left_join(idlsr, by = "id") %>%
  rename("lsr_self" = "lastsryr",
         "h40_self" = "hv40_comm") %>%
  left_join(idkidsnns, by = "id")  %>%
  mutate(kidnn = replace_na(kidnn, "placeholder")) %>%
  unnest(kidnn) %>%
  mutate(kidnn = as.character(kidnn)) %>%
  left_join(idlsr, by = c("kidnn" = "id")) %>%
  rename("lsr_kidnn" = "lastsryr",
         "h40_kidnn" = "hv40_comm") %>%
  left_join(idkids, by = c("kidnn" = "id")) %>%
  rename("gkids" = "kids") %>%
  mutate(gkids = map(gkids, ~if(is.null(.x) || length(.x) == 0 || all(is.na(.x))) list("placeholder") else .x),
         # Ensure every element is a list of characters, even if it is a single value
         gkids = map(gkids, ~map(.x, ~if(!is.character(.x)) as.character(.x) else .x))) %>%
  # Unnest the 'gkids' column to expand lists into individual rows
  unnest(gkids) %>%
  mutate(gkids = as.character(gkids)) %>%
  left_join(idlsr, by = c("gkids" = "id")) %>%
  rename("lsr_gkid" = "lastsryr",
         "h40_gkid" = "hv40_comm") %>%
  left_join(idkids, by = c("gkids" = "id")) %>% 
  rename("ggkids" = "kids") %>% 
  mutate(ggkids = map(ggkids, ~if(is.null(.x) || length(.x) == 0 || all(is.na(.x))) list("placeholder") else .x),
         # Ensure every element is a list of characters, even if it is a single value
         ggkids = map(ggkids, ~map(.x, ~if(!is.character(.x)) as.character(.x) else .x))) %>%
  unnest(ggkids) %>%
  mutate(ggkids = as.character(ggkids)) %>%
  left_join(idlsr, by = c("ggkids" = "id")) %>%
  rename("lsr_ggkid" = "lastsryr",
         "h40_ggkid" = "hv40_comm") %>%
  # pmax() already works element-wise across the four columns, so the table
  # does not need rowwise(); evaluating it once over whole columns gives the
  # same values without a per-row evaluation on this heavily expanded table
  mutate(maxsrdesc = pmax(lsr_self, lsr_kidnn, lsr_gkid, lsr_ggkid, na.rm = TRUE),
         maxh40desc = pmax(h40_self, h40_kidnn, h40_gkid, h40_ggkid, na.rm = TRUE)) %>%
  # rows where all four values are missing get 0 so max() below stays finite
  mutate(maxsrdesc2 = ifelse(is.na(maxsrdesc), 0, maxsrdesc),
         maxh40desc2 = ifelse(is.na(maxh40desc), 0, maxh40desc)) %>%
  group_by(id) %>%
  summarise(lastsr_desc = max(maxsrdesc2, na.rm = TRUE),
            maxh40d = max(maxh40desc2, na.rm = TRUE)) %>%
  mutate(h40d = ifelse(maxh40d==0, NA, maxh40d))


### kinters3 rebuilt from kinters2 with three 0/1 indicator columns:
###   sr         - srsum greater than 1 (NA when srsum is missing)
###   in_kinlock - id appears in kinterlocks
###   neverdal   - arrive_dallas is exactly "never" (missing counts as 0)
kinters3 <- kinters2 %>%
  mutate(
    sr = as.numeric(srsum > 1),
    in_kinlock = as.numeric(id %in% kinterlocks$id),
    neverdal = as.numeric(arrive_dallas %in% "never")
  )




### Spouse ties from the raw relationship sheet, with start and end years.
### Recorded start/end are used when present. Otherwise start is the younger
### spouse's birth year + 21 and end is the earlier of the two death years.
### find_birthyear() / find_deathyear() are defined elsewhere in this file.
spouses3 <- rel_raw %>%
  drop_na(first1, first2, edgetype) %>%
  filter(p1 != p2, edgetype == "Spouse") %>%
  rename(et = edgetype, et2 = et_2) %>%
  mutate(
    # lower-case the edge type and strip hyphens, spaces and underscores
    et = gsub("[-_ ]", "", tolower(et)),
    p1_byear = find_birthyear(p1),
    p2_byear = find_birthyear(p2),
    p1_dyear = find_deathyear(p1),
    p2_dyear = find_deathyear(p2),
    younger = pmax(p1_byear, p2_byear),
    diedfirst = pmin(p1_dyear, p2_dyear, na.rm = TRUE),
    start = as.numeric(ifelse(is.na(start), younger + 21, start)),
    end = as.numeric(ifelse(is.na(end), diedfirst, end)),
    # ids become character only after the year lookups above
    p1 = as.character(p1),
    p2 = as.character(p2)
  ) %>%
  drop_na(p1_byear, p2_byear) %>%
  select(p1, p2, et, start, end, p1_byear, p2_byear, p1_dyear, p2_dyear)


###new little df from r2 with specific types that has birth and death years in it
# r3 = every tie in `rel` whose edgetype is not "spouse", with the edge type
# normalised and start/end years derived from the two people's birth and death
# years, followed by the rows of spouses3.
# NOTE(review): the filter tests lower-case "spouse", while spouses3 selects
# "Spouse" from rel_raw -- confirm how `rel` stores this value, otherwise
# spouse ties may enter twice.
r3 <- rel %>%
  filter(p1 != p2,
         edgetype != "spouse") %>%
  rename(et = edgetype) %>%
  # lower-case the edge type and strip hyphens, spaces and underscores
  mutate(et = tolower(et),
         et = gsub("-", "", et, ignore.case = TRUE),
         et = gsub(" ", "", et, ignore.case = TRUE),
         et = gsub("_", "", et, ignore.case = TRUE),
         p1_byear = find_birthyear(p1),
         p2_byear = find_birthyear(p2),
         p1_dyear = find_deathyear(p1),
          p2_dyear = find_deathyear(p2)) %>%
  # repeats the p1 != p2 condition already applied above
  filter(p1 != p2) %>%
  mutate(p1 = as.character(p1),
         p2 = as.character(p2)) %>%
  # death years are looked up again, this time with character ids, and
  # overwrite the values computed above
  # start = later of the two birth years; end = earlier of the two death years
  mutate(p1_dyear = find_deathyear(p1),
         p2_dyear = find_deathyear(p2),
         start = pmax(p1_byear, p2_byear),
         end = pmin(p1_dyear, p2_dyear, na.rm = T)) %>%
  # same columns, in the same order, as spouses3 so its rows can be appended
  select(colnames(spouses3)) %>%
  add_row(spouses3)

## One (id, start) table per elite type, where start is the year the person is
## treated as becoming elite: deb_year when recorded (social elites only),
## otherwise birth year + 29.
socials_byyear <- ppl %>%
  filter(id %in% socials) %>%
  mutate(start = coalesce(deb_year, byear + 29)) %>%
  select(id, start)

economics_byyear <- ppl %>%
  filter(id %in% economics) %>%
  mutate(start = byear + 29) %>%
  select(id, start)

politicals_byyear <- ppl %>%
  filter(id %in% politicals) %>%
  mutate(start = byear + 29) %>%
  select(id, start)

## relationship types that count towards a kinship interlock
rellist <- c(
  "parentchild",
  "spouse",
  "auntuncleniecenephew",
  "cousinsfirst",
  "grandparentgrandchild",
  "halfsiblingmat",
  "halfsiblingpat",
  "sibling"
)


### every year from 1881 through 1950
klyrs <- seq.int(1881L, 1950L)


### first calculation: relationship between being in a kinship interlock in
### 1910 and having descendants after 1940.
### klyr() is defined elsewhere in this file; kl1910 is its result for 1910.

kl1910 <- klyr(1910)

## Cohort for the 1910 persistence analysis: born by 1880, dyear and
## left_dallas both >= 1910, firstsryr before 1910. Descendant outcomes come
## from kinteld and the 1910 kin configuration from kl1910.
## NOTE(review): elitegroup is not among the columns kept by select(), so the
## elitegroup used further down must be supplied by kl1910 (presumably elite
## status as of 1910) -- verify.
kintelders <- kinters2 %>%
  filter(byear <= 1880, dyear >= 1910, left_dallas >= 1910) %>%
  left_join(kinteld, by = "id") %>%
  mutate(
    # 1 when the latest lastsryr among self/descendants is after 1940
    anyd40 = as.numeric(lastsr_desc > 1940),
    ever_in_kinlock = as.numeric(id %in% kinterlocks$id)
  ) %>%
  select(
    id, woman, last, first, middle, maiden, cluster, kcore, jewish, arrive_dallas,
    left_dallas, lastsr_desc, maxh40d, h40d, anyd40, ever_in_kinlock, srsum, firstsryr
  ) %>%
  left_join(kl1910, by = "id") %>%
  mutate(
    # people with no match in kl1910 count as not in a kinship interlock
    kinlock = ifelse(is.na(kinlock), 0, kinlock),
    # direct tie = known kin distance below 2
    econtie = as.numeric(!is.na(econ_kd) & econ_kd < 2),
    soctie  = as.numeric(!is.na(soc_kd) & soc_kd < 2),
    poltie  = as.numeric(!is.na(pol_kd) & pol_kd < 2),
    # distances of 100 or more are blanked out
    econ_kd2 = ifelse(!is.na(econ_kd) & econ_kd < 100, econ_kd, NA),
    soc_kd2  = ifelse(!is.na(soc_kd) & soc_kd < 100, soc_kd, NA),
    pol_kd2  = ifelse(!is.na(pol_kd) & pol_kd < 100, pol_kd, NA)
  ) %>%
  filter(firstsryr < 1910) %>%
  mutate(
    # fill colour per elite group; anything unlisted gets the default grey
    eltype_fill = coalesce(
      unname(
        c(
          "All"                = "white",
          "Economic"           = "darkgoldenrod1",
          "SuperSocial"        = "dodgerblue2",
          "Political"          = "red2",
          "Political-Economic" = "darkorange1",
          "Social-Economic"    = "springgreen3",
          "Social-Political"   = "darkviolet"
        )[as.character(elitegroup)]
      ),
      "gray84"
    ),
    # kin configuration: a kinship interlock wins, then the one- and two-way
    # tie patterns; every other combination is "NoKnownDirectTies"
    kdgroup = case_when(
      kinlock == 1 ~ "KinshipInterlock",
      poltie == 0 & soctie == 1 & econtie == 0 ~ "SuperSocialTie",
      poltie == 0 & soctie == 0 & econtie == 1 ~ "EconomicTie",
      poltie == 1 & soctie == 0 & econtie == 0 ~ "PoliticalTie",
      poltie == 1 & soctie == 0 & econtie == 1 ~ "Political-EconomicTie",
      poltie == 0 & soctie == 1 & econtie == 1 ~ "Social-EconomicTie",
      poltie == 1 & soctie == 1 & econtie == 0 ~ "Social-PoliticalTie",
      TRUE ~ "NoKnownDirectTies"
    ),
    kdgroup_fill = unname(
      c(
        "KinshipInterlock"      = "white",
        "EconomicTie"           = "darkgoldenrod1",
        "SuperSocialTie"        = "dodgerblue2",
        "PoliticalTie"          = "red2",
        "Political-EconomicTie" = "darkorange1",
        "Social-EconomicTie"    = "springgreen3",
        "Social-PoliticalTie"   = "darkviolet",
        "NoKnownDirectTies"     = "gray84"
      )[kdgroup]
    )
  )


## Per kin configuration: group size, share of the cohort, share with
## descendants after 1940, and mean maxh40d. The fill colour is read from
## kintelders, where it is already a fixed function of kdgroup.
klock10 <- kintelders %>%
  group_by(kdgroup) %>%
  summarise(
    n = n(),
    perc = (n / nrow(kintelders)) * 100,
    persisted = mean(anyd40) * 100,
    h40d = mean(maxh40d),
    kdgroup_fill = first(kdgroup_fill)
  )


# persistence (share with anyd40 == 1) per elite group, highest first
elgroup_anyd40 <- kintelders %>%
  group_by(elitegroup) %>%
  summarise(
    n = n(),
    percentage = mean(anyd40) * 100,
    eltype_fill = first(eltype_fill)
  ) %>%
  arrange(-percentage)

# overall persistence rate for the cohort, in percent
meanpers <- 100 * mean(kintelders$anyd40)


# persistence for individual elites - for appendix
# horizontal bars coloured by elite type, with the overall mean as a dotted line
ggplot(
  elgroup_anyd40,
  aes(x = reorder(elitegroup, percentage), y = percentage, fill = eltype_fill)
) +
  geom_col(colour = "black") +
  geom_hline(yintercept = meanpers, linetype = "dotted", color = "black") +
  annotate(
    "text",
    x = 1, y = meanpers,
    label = paste("Overall Mean =", round(meanpers, 1), "%"),
    color = "black", hjust = -0.1
  ) +
  coord_flip() +
  scale_fill_identity() +
  ylim(0, 100) +
  labs(
    title = "Upper-class persistence by individual elite status",
    x = "Individual elite status in 1910",
    y = "Family upper-class persistence after 1940"
  ) +
  theme_minimal()

### main persistence graph
## persistence (share with anyd40 == 1) per kin configuration, highest first.
## eltype_fill is simply the first row's value within each group and is not
## used by the plot; kdgroup_fill is read from kintelders, where it is already
## a fixed function of kdgroup.
kdgroup_anyd40 <- kintelders %>%
  group_by(kdgroup) %>%
  summarise(
    n = n(),
    percentage = mean(anyd40) * 100,
    eltype_fill = first(eltype_fill),
    kdgroup_fill = first(kdgroup_fill)
  ) %>%
  arrange(-percentage)

## horizontal bars coloured by kin configuration, with the overall cohort mean
## (meanpers) as a dotted reference line
ggplot(
  kdgroup_anyd40,
  aes(x = reorder(kdgroup, percentage), y = percentage, fill = kdgroup_fill)
) +
  geom_col(colour = "black") +
  geom_hline(yintercept = meanpers, linetype = "dotted", color = "black") +
  annotate(
    "text",
    x = 1, y = meanpers,
    label = paste("Overall Mean =", round(meanpers, 1), "%"),
    color = "black", hjust = -0.1
  ) +
  coord_flip() +
  scale_fill_identity() +
  ylim(0, 100) +
  labs(
    title = "Upper-class persistence by elite kinship",
    x = "Kin configuration 1910",
    y = "Family upper-class persistence after 1940"
  ) +
  theme_minimal()



## same comparison restricted to people who are not elite themselves
## (elitegroup == "Regular") - for appendix
meanpers_reg <- 100 * mean(kintelders$anyd40[kintelders$elitegroup %in% "Regular"])

## NOTE(review): perc divides by the size of the whole cohort, not by the
## number of "Regular" rows -- confirm that is the intended denominator.
klock10_reg <- kintelders %>%
  filter(elitegroup == "Regular") %>%
  group_by(kdgroup) %>%
  summarise(
    n = n(),
    perc = (n / nrow(kintelders)) * 100,
    persisted = mean(anyd40) * 100,
    h40d = mean(maxh40d),
    h40dmed = median(maxh40d),
    kdgroup_fill = first(kdgroup_fill)
  )

## horizontal bars coloured by kin configuration; the dotted line is the mean
## persistence among "Regular" people
ggplot(
  klock10_reg,
  aes(x = reorder(kdgroup, persisted), y = persisted, fill = kdgroup_fill)
) +
  geom_col(colour = "black") +
  annotate(
    "text",
    x = 1, y = meanpers_reg,
    label = paste("Overall Mean =", round(meanpers_reg, 1), "%"),
    color = "black", hjust = -0.25
  ) +
  geom_hline(yintercept = meanpers_reg, linetype = "dotted", color = "black") +
  coord_flip() +
  scale_fill_identity() +
  ylim(0, 100) +
  labs(
    title = "Upper-class persistence for non-elites by direct kin ties to elites",
    x = "Direct ties to elites in 1910",
    y = "Family upper-class persistence after 1940"
  ) +
  theme_minimal()


###logistic regression for appendix
# Logit model of anyd40 (descendants after 1940) on the 1910 kinship-interlock
# flag, controlling for woman, byear and jewish.
# NOTE(review): byear is not among the columns selected when kintelders is
# built, so it presumably arrives through the join with kl1910 -- verify.
lsmod2 <- glm(anyd40 ~  kinlock + woman + byear + jewish, 
              data = kintelders, 
              family = "binomial")
summary(lsmod2)
# exponentiated coefficients, i.e. odds ratios
exp(coef(lsmod2))




########### ELITE OVERLAP:::: making venn diagrams for appendix A ############


# One logical column per exclusive region of the three-set diagram.
# socelite / polelite / econelite are 0/1 integer flags, so a sum of flags
# equal to 0 means "in none of them" and equal to their count means "in all".
kinters3 <- kinters3 %>%
  mutate(
    socelite_only  = socelite == 1 & (polelite + econelite) == 0,
    polelite_only  = polelite == 1 & (socelite + econelite) == 0,
    econelite_only = econelite == 1 & (socelite + polelite) == 0,

    socelite_polelite  = (socelite + polelite) == 2 & econelite == 0,
    socelite_econelite = (socelite + econelite) == 2 & polelite == 0,

    polelite_econelite = (polelite + econelite) == 2 & socelite == 0,

    socelite_polelite_econelite = (socelite + polelite + econelite) == 3
  )

# number of people in each region, named with "&"-joined set names
comb_counts <- kinters3 %>%
  summarise(
    socelite = sum(socelite_only),
    polelite = sum(polelite_only),
    econelite = sum(econelite_only),

    `socelite&polelite` = sum(socelite_polelite),
    `socelite&econelite` = sum(socelite_econelite),
    `polelite&econelite` = sum(polelite_econelite),

    `socelite&polelite&econelite` = sum(socelite_polelite_econelite)
  )

# flatten the one-row table into a named numeric vector
combination_counts <- setNames(as.numeric(comb_counts), colnames(comb_counts))

# fit the euler diagram with ellipses
euler_diagram <- eulerr::euler(
  combination_counts,
  control = list(maxiter = 10000, tol = 1e-10),
  shape = "ellipse"
)

# draw it: one fill per set, black outlines, custom bold set labels, and the
# count printed in every region
plot(
  euler_diagram,
  fills = c("dodgerblue2", "red2", "darkgoldenrod1"),
  edges = list(col = "black"),
  labels = list(
    labels = c("Social Elite", "Political Elite", "Economic Elite"),
    col = "black",
    fontsize = 10,
    fontface = "bold"
  ),
  quantities = TRUE
)

### four-set version: the three elite sets plus "Upper Class", defined here as
### srsum > 0 with firstsryr before 1945
df_sets <- kinters3 %>%
  transmute(
    `Social Elite`    = socelite == 1,
    `Political Elite` = polelite == 1,
    `Economic Elite`  = econelite == 1,
    `Upper Class`     = (srsum > 0) & (firstsryr < 1945)
  )

# each set is the row positions where its flag is TRUE; which() skips NA
venn_list <- lapply(df_sets, which)


ggVennDiagram(venn_list, label_alpha = 0, edge_size = 0.9) +
  scale_fill_gradient(name = "Count", low = "#F7FBFF", high = "steelblue") +
  theme_void() +
  theme(
    plot.title = element_text(face = "bold", size = 12),
    legend.position = "right"
  ) +
  ggtitle("Overlap between economic elites, political elites, social elites, and the upper class from 1895-1945 in the dataset")



